Assembler Code verstehen: wo anfangen

camper

shisha schrieb:

Zum Vergleich hier mal die Ausgabe des C++-Programms für den Anfang der Funktion:

mov     0x38(%rdi),%rcx                                                             mov     %edi,%eax
mov     %rdx,%r8                                                                    mov     $0x51eb851f,%edx
movslq  %esi,%rax                                                                   mov     %edi,%ecx
xor     %edx,%edx                                                                   imul    %edx
div     %rcx

Das sieht allerdings nicht so aus, als ob die C++-Variante wirklich äquivalent ist.
In C++ scheint der erste Parameter eine Referenz auf ein Objekt zu sein, das den Divisor als Member enthält, während in der C-Variante der Divisor offenbar kein Funktionsparameter ist.

Kannst du die beiden Varianten zeigen (vorzugsweise mit allen notwendigen zusätzlichen Deklarationen, um es compilieren zu können)?

Den gesamten Quellcode kann ich leider nicht freigeben, das wäre viel zu viel.
Aber prinzipiell hast du Recht.
So sieht es ungefähr aus:

C:

// Selects the stencil for grid point i of the HEAT2D_N x HEAT2D_N grid
// (row-major: row = i / N, column = i % N) and returns its value.
//
//   i  linear index of the grid point
//   t  not used by this function
//   y  values the stencils read from
double heat_2d(int i, double t, const double *y)
{
  const int N = HEAT2D_N;
  const int last = N - 1; // index of the final row / column
  const int row = i / N;
  const int col = i % N;

  if (row <= 0)
  { // top row
    if (col <= 0)
      return stencil_corner_top_left(y, N);
    if (col >= last)
      return stencil_corner_top_right(y, N);
    return stencil_top(i, y, N); // non-corner point of top row
  }

  if (row >= last)
  { // bottom row
    if (col <= 0)
      return stencil_corner_bottom_left(y, N);
    if (col >= last)
      return stencil_corner_bottom_right(y, N);
    return stencil_bottom(i, y, N); // non-corner point of bottom row
  }

  // neither top nor bottom row
  if (col <= 0)
    return stencil_left(i, y, N); // left column
  if (col >= last)
    return stencil_right(i, y, N); // right column
  return stencil_inner(i, y, N); // inner point
}

C++:

/// 2D heat problem on an N x N grid whose points are stored row-major
/// (point i lives in row i / N, column i % N).
///
/// NOTE(review): the stencil_* helpers called by eval_component() are not part
/// of this excerpt; they are expected to be members of this class.
///
/// @tparam scalar_type  arithmetic type of a single grid value
/// @tparam state_type   indexable container of grid values
template<typename scalar_type = double, typename state_type = std::vector<scalar_type>>
class Heat2D : public IVP<scalar_type, state_type>
{
public:

  /// Default problem: edge length HEAT2D_N, system size HEAT2D_n passed to IVP.
  /// The member N is initialised here; previously only a shadowing local was
  /// set, which would have left eval_component() reading an unset N.
  Heat2D() : IVP<scalar_type, state_type>(HEAT2D_n), N(HEAT2D_N)
  {
    fill_initial();
  }

  /// Problem with a caller-chosen edge length N (N * N grid points).
  /// NOTE(review): N is presumably expected to be >= 2, since the initial
  /// values divide by N - 1 — confirm against callers.
  Heat2D(int N) : IVP<scalar_type, state_type>(N * N), N(N)
  {
    fill_initial();
  }

  /// Selects the stencil for grid point i according to its position
  /// (corner, edge or inner point) and returns the stencil's value.
  ///
  /// @param i  linear index of the grid point
  /// @param t  not used by this function
  /// @param y  values the stencils read from
  scalar_type eval_component(int i, scalar_type t, const state_type &y)
  {
    const int last = N - 1; // index of the final row / column
    const int ri = i / N;
    const int ci = i % N;

    if (ri <= 0)
    { // top row
      if (ci <= 0)
        return stencil_corner_top_left(y, N);
      if (ci >= last)
        return stencil_corner_top_right(y, N);
      return stencil_top(i, y, N); // non-corner point of top row
    }

    if (ri >= last)
    { // bottom row
      if (ci <= 0)
        return stencil_corner_bottom_left(y, N);
      if (ci >= last)
        return stencil_corner_bottom_right(y, N);
      return stencil_bottom(i, y, N); // non-corner point of bottom row
    }

    // neither top nor bottom row
    if (ci <= 0)
      return stencil_left(i, y, N); // left column
    if (ci >= last)
      return stencil_right(i, y, N); // right column
    return stencil_inner(i, y, N); // inner point
  }

private:

  /// Writes the initial values shared by both constructors:
  /// 0.5 in column 0 rising linearly to 1.5 in column N - 1, identical in every row.
  void fill_initial()
  {
    for (int i = 0; i < N; i++)
      for (int j = 0; j < N; j++)
        this->initial[i * N + j] = 0.5 + static_cast<double>(j) / static_cast<double>(N - 1);
  }

  int N; ///< edge length of the grid (run-time value, unlike HEAT2D_N in the C variant)
};

Die Stencils sind hier einfache 5-Punkt-2D-Jacobi-Stencils, z.B.:

// C++
// 5-point stencil for a grid point that has all four neighbours:
// (N - 1)^2 * (up + left - 4 * centre + right + down).
scalar_type stencil_inner(int i, const state_type &y, int N)
  {
    const scalar_type scale = ((static_cast<scalar_type>(N) - 1.0) * (static_cast<scalar_type>(N) - 1.0));

    const int up = i - N;
    const int left = i - 1;
    const int right = i + 1;
    const int down = i + N;

    // Same left-to-right summation order as before, so rounding is unchanged.
    const auto sum = y[up] + y[left] - 4.0 * y[i] + y[right] + y[down];
    return scale * sum;
  }

// C
// 5-point stencil for a grid point that has all four neighbours:
// (N - 1)^2 * (up + left - 4 * centre + right + down).
static double stencil_inner(int i, const double *y, int N)
{
  const double scale = ((double)N - 1.0) * ((double)N - 1.0);

  const int up = i - N;
  const int left = i - 1;
  const int right = i + 1;
  const int down = i + N;

  // Same left-to-right summation order as before, so rounding is unchanged.
  return scale * (y[up] + y[left] - 4.0 * y[i] + y[right] + y[down]);
}

camper

Könnte man das nicht viel kürzer haben? Etwa:

// Compact variant: one formula for every grid point. A neighbour that would
// lie outside the grid is replaced by the point i itself.
// t is not used.
scalar_type eval_component(int i, scalar_type t, const state_type &y)
  {
    const int row = i / N;
    const int col = i % N;

    const auto up    = (row == 0)     ? i : i - N;
    const auto left  = (col == 0)     ? i : i - 1;
    const auto right = (col + 1 == N) ? i : i + 1;
    const auto down  = (row + 1 == N) ? i : i + N;

    return (N - 1) * (N - 1) * (y[up] + y[left] + y[right] + y[down] - y[i] * 4);
  }

Wenn das Ganze in einer Schleife aufgerufen wird, sind die ganzen Bedingungen ohnehin redundant.

Die Ränder unterscheiden sich durchaus in ihrer Berechnung.
Dennoch ist es richtig, dass sich hier einiges verbessern lässt, wenn man weiß wie auf die Daten zugegriffen wird.

Darum soll es allerdings hier noch nicht gehen. Es steht die Portierung von C nach C++ mit den dabei festgestellten Laufzeitunterschieden im Fokus.

Primäres Ziel ist es tatsächlich, herauszufinden, warum die C++-Variante fast doppelt so viel Laufzeit benötigt.

camper

Die Tatsache, dass in der C-Variante N eine bereits beim Compilieren bekannte Größe ist, dürfte eine große Rolle spielen. Nachdem all die stencil_*-Aufrufe vermutlich geinlined werden, hat es der Compiler hier möglicherweise einfacher, wenn es darum geht, Ausdrücke zusammenzufassen (kann ich natürlich nicht einschätzen, ohne alle relevanten Funktionen gesehen zu haben). Zudem müssen etwa Ausdrücke wie i + N gar nicht erst berechnet werden, wenn damit nur auf y zugegriffen wird; bei konstantem N kann das einfach als Offset in die Adressierung einfließen.

Für ein äquivalentes C++-Programm wäre es angemessen, dies ebenfalls konstant zu machen (z.B. als Templateparameter). Andernfalls vergleichst du durchaus unterschiedliche Programme, und der Overhead, mit dem du zu kämpfen hast, hat absolut nichts mit Objektorientierung als solcher zu tun, sondern schlicht damit, dass die C++-Variante flexibler ist und mehr kann.

Bei Interesse kann ich die beiden Programme mal privat versenden, falls du mal drüber schauen willst.

Ich poste hier mal die beiden vollständigen assemblies von C und C++:

C:

ode_eval_comp:
0000000000401110:   mov     %edi,%eax
0000000000401112:   mov     $0x51eb851f,%edx
0000000000401117:   mov     %edi,%ecx
0000000000401119:   imul    %edx
000000000040111b:   mov     %edi,%eax
000000000040111d:   sar     $0x1f,%eax
0000000000401120:   sar     $0x5,%edx
0000000000401123:   sub     %eax,%edx
0000000000401125:   imul    $0x64,%edx,%eax
0000000000401128:   sub     %eax,%ecx
000000000040112a:   test    %edx,%edx
000000000040112c:   jle     0x4011f0 <ode_eval_comp+224>
0000000000401132:   cmp     $0x62,%edx
0000000000401135:   jg      0x401198 <ode_eval_comp+136>
0000000000401137:   test    %ecx,%ecx
0000000000401139:   jle     0x401338 <ode_eval_comp+552>
000000000040113f:   lea     -0x64(%rdi),%eax
0000000000401142:   cmp     $0x63,%ecx
0000000000401145:   cltq    
0000000000401147:   lea     0x0(,%rax,8),%rdx
000000000040114f:   je      0x401240 <ode_eval_comp+304>
0000000000401155:   movsd   0x318(%rsi,%rdx,1),%xmm0
000000000040115e:   movslq  %edi,%rdi
0000000000401161:   movsd   0xd09f(%rip),%xmm1        # 0x40e208
0000000000401169:   addsd   (%rsi,%rax,8),%xmm0
000000000040116e:   mulsd   0x320(%rsi,%rdx,1),%xmm1
0000000000401177:   subsd   %xmm1,%xmm0
000000000040117b:   addsd   0x8(%rsi,%rdi,8),%xmm0
0000000000401181:   addsd   0x320(%rsi,%rdi,8),%xmm0
000000000040118a:   mulsd   0xd07e(%rip),%xmm0        # 0x40e210
0000000000401192:   retq    
0000000000401193:   nopl    0x0(%rax,%rax,1)
0000000000401198:   test    %ecx,%ecx
000000000040119a:   jle     0x401308 <ode_eval_comp+504>
00000000004011a0:   cmp     $0x63,%ecx
00000000004011a3:   je      0x401280 <ode_eval_comp+368>
00000000004011a9:   lea     -0x64(%rdi),%eax
00000000004011ac:   movsd   0xd054(%rip),%xmm1        # 0x40e208
00000000004011b4:   movslq  %edi,%rdi
00000000004011b7:   cltq    
00000000004011b9:   movsd   (%rsi,%rax,8),%xmm0
00000000004011be:   lea     0x0(,%rax,8),%rdx
00000000004011c6:   addsd   %xmm0,%xmm0
00000000004011ca:   mulsd   0x320(%rsi,%rdx,1),%xmm1
00000000004011d3:   addsd   0x318(%rsi,%rdx,1),%xmm0
00000000004011dc:   subsd   %xmm1,%xmm0
00000000004011e0:   addsd   0x8(%rsi,%rdi,8),%xmm0
00000000004011e6:   mulsd   0xd022(%rip),%xmm0        # 0x40e210
00000000004011ee:   retq    
00000000004011ef:   nop     
00000000004011f0:   test    %ecx,%ecx
00000000004011f2:   jle     0x4012b0 <ode_eval_comp+416>
00000000004011f8:   cmp     $0x63,%ecx
00000000004011fb:   je      0x4012d8 <ode_eval_comp+456>
0000000000401201:   movsd   0xcfff(%rip),%xmm1        # 0x40e208
0000000000401209:   movslq  %edi,%rdi
000000000040120c:   movsd   -0x8(%rsi,%rdi,8),%xmm0
0000000000401212:   lea     0x1(%rdi),%rax
0000000000401216:   mulsd   (%rsi,%rdi,8),%xmm1
000000000040121b:   subsd   %xmm1,%xmm0
000000000040121f:   movsd   0x318(%rsi,%rax,8),%xmm1
0000000000401228:   addsd   %xmm1,%xmm1
000000000040122c:   addsd   (%rsi,%rax,8),%xmm0
0000000000401231:   addsd   %xmm1,%xmm0
0000000000401235:   mulsd   0xcfd3(%rip),%xmm0        # 0x40e210
000000000040123d:   retq    
000000000040123e:   xchg    %ax,%ax
0000000000401240:   movsd   0x318(%rsi,%rdx,1),%xmm0
0000000000401249:   add     $0x64,%edi
000000000040124c:   movsd   0xcfb4(%rip),%xmm1        # 0x40e208
0000000000401254:   movslq  %edi,%rdi
0000000000401257:   addsd   %xmm0,%xmm0
000000000040125b:   mulsd   0x320(%rsi,%rdx,1),%xmm1
0000000000401264:   addsd   (%rsi,%rax,8),%xmm0
0000000000401269:   subsd   %xmm1,%xmm0
000000000040126d:   addsd   (%rsi,%rdi,8),%xmm0
0000000000401272:   mulsd   0xcf96(%rip),%xmm0        # 0x40e210
000000000040127a:   retq    
000000000040127b:   nopl    0x0(%rax,%rax,1)
0000000000401280:   movsd   0x13558(%rsi),%xmm0
0000000000401288:   movsd   0x13878(%rsi),%xmm1
0000000000401290:   addsd   0x13870(%rsi),%xmm0
0000000000401298:   addsd   %xmm1,%xmm1
000000000040129c:   subsd   %xmm1,%xmm0
00000000004012a0:   mulsd   0xcf70(%rip),%xmm0        # 0x40e218
00000000004012a8:   retq    
00000000004012a9:   nopl    0x0(%rax)
00000000004012b0:   movsd   0xcf68(%rip),%xmm0        # 0x40e220
00000000004012b8:   mulsd   (%rsi),%xmm0
00000000004012bc:   addsd   0x8(%rsi),%xmm0
00000000004012c1:   addsd   0x320(%rsi),%xmm0
00000000004012c9:   mulsd   0xcf47(%rip),%xmm0        # 0x40e218
00000000004012d1:   retq    
00000000004012d2:   nopw    0x0(%rax,%rax,1)
00000000004012d8:   movsd   0x318(%rsi),%xmm0
00000000004012e0:   movapd  %xmm0,%xmm1
00000000004012e4:   addsd   %xmm0,%xmm1
00000000004012e8:   movsd   0x310(%rsi),%xmm0
00000000004012f0:   subsd   %xmm1,%xmm0
00000000004012f4:   addsd   0x638(%rsi),%xmm0
00000000004012fc:   mulsd   0xcf14(%rip),%xmm0        # 0x40e218
0000000000401304:   retq    
0000000000401305:   nopl    (%rax)
0000000000401308:   movsd   0x13560(%rsi),%xmm0
0000000000401310:   movapd  %xmm0,%xmm1
0000000000401314:   addsd   %xmm0,%xmm1
0000000000401318:   movsd   0x13240(%rsi),%xmm0
0000000000401320:   subsd   %xmm1,%xmm0
0000000000401324:   addsd   0x13568(%rsi),%xmm0
000000000040132c:   mulsd   0xcee4(%rip),%xmm0        # 0x40e218
0000000000401334:   retq    
0000000000401335:   nopl    (%rax)
0000000000401338:   sub     $0x64,%edi
000000000040133b:   movslq  %edi,%rdi
000000000040133e:   lea     0x0(,%rdi,8),%rax
0000000000401346:   movsd   0x640(%rsi,%rax,1),%xmm0
000000000040134f:   movsd   0x328(%rsi,%rax,1),%xmm1
0000000000401358:   addsd   (%rsi,%rdi,8),%xmm0
000000000040135d:   addsd   %xmm1,%xmm1
0000000000401361:   addsd   %xmm1,%xmm0
0000000000401365:   movsd   0xce9b(%rip),%xmm1        # 0x40e208
000000000040136d:   mulsd   0x320(%rsi,%rax,1),%xmm1
0000000000401376:   subsd   %xmm1,%xmm0
000000000040137a:   mulsd   0xce8e(%rip),%xmm0        # 0x40e210
0000000000401382:   retq    
0000000000401383:   nopl    (%rax)
0000000000401386:   nopw    %cs:0x0(%rax,%rax,1)

C++

Heat2D<double, std::vector<double, std::allocator<double> > >::eval_component(int, double, std::vector<double, std::allocator<double> > const&):
0000000000401820:   mov     0x38(%rdi),%rcx
0000000000401824:   mov     %rdx,%r8
0000000000401827:   movslq  %esi,%rax
000000000040182a:   xor     %edx,%edx
000000000040182c:   div     %rcx
000000000040182f:   test    %eax,%eax
0000000000401831:   jle     0x401920 <Heat2D<double, std::vector<double, std::allocator<double> > >::eval_component(int, double, std::vector<double, std::allocator<double> > const&)+256>
0000000000401837:   lea     -0x1(%rcx),%rdi
000000000040183b:   cltq    
000000000040183d:   cmp     %rdi,%rax
0000000000401840:   jae     0x4018b0 <Heat2D<double, std::vector<double, std::allocator<double> > >::eval_component(int, double, std::vector<double, std::allocator<double> > const&)+144>
0000000000401842:   test    %edx,%edx
0000000000401844:   jle     0x401af8 <Heat2D<double, std::vector<double, std::allocator<double> > >::eval_component(int, double, std::vector<double, std::allocator<double> > const&)+728>
000000000040184a:   pxor    %xmm1,%xmm1
000000000040184e:   movslq  %edx,%rax
0000000000401851:   lea     -0x1(%rsi),%edx
0000000000401854:   cmp     %rax,%rdi
0000000000401857:   mov     (%r8),%rax
000000000040185a:   movslq  %edx,%rdx
000000000040185d:   cvtsi2sd        %ecx,%xmm1
0000000000401861:   subsd   0x6787(%rip),%xmm1        # 0x407ff0
0000000000401869:   ja      0x401998 <Heat2D<double, std::vector<double, std::allocator<double> > >::eval_component(int, double, std::vector<double, std::allocator<double> > const&)+376>
000000000040186f:   movsd   (%rax,%rdx,8),%xmm0
0000000000401874:   mov     %esi,%edi
0000000000401876:   sub     %ecx,%edi
0000000000401878:   movsd   0x6778(%rip),%xmm2        # 0x407ff8
0000000000401880:   addsd   %xmm0,%xmm0
0000000000401884:   movslq  %edi,%rdi
0000000000401887:   mulsd   0x8(%rax,%rdx,8),%xmm2
000000000040188d:   addsd   (%rax,%rdi,8),%xmm0
0000000000401892:   subsd   %xmm2,%xmm0
0000000000401896:   add     %esi,%ecx
0000000000401898:   mulsd   %xmm1,%xmm1
000000000040189c:   movslq  %ecx,%rcx
000000000040189f:   addsd   (%rax,%rcx,8),%xmm0
00000000004018a4:   mulsd   %xmm1,%xmm0
00000000004018a8:   retq    
00000000004018a9:   nopl    0x0(%rax)
00000000004018b0:   test    %edx,%edx
00000000004018b2:   jle     0x401aa0 <Heat2D<double, std::vector<double, std::allocator<double> > >::eval_component(int, double, std::vector<double, std::allocator<double> > const&)+640>
00000000004018b8:   movslq  %edx,%rax
00000000004018bb:   cmp     %rax,%rdi
00000000004018be:   jbe     0x4019d0 <Heat2D<double, std::vector<double, std::allocator<double> > >::eval_component(int, double, std::vector<double, std::allocator<double> > const&)+432>
00000000004018c4:   mov     (%r8),%rax
00000000004018c7:   lea     -0x1(%rsi),%edx
00000000004018ca:   sub     %ecx,%esi
00000000004018cc:   pxor    %xmm1,%xmm1
00000000004018d0:   movslq  %esi,%rsi
00000000004018d3:   movslq  %edx,%rdx
00000000004018d6:   movsd   0x671a(%rip),%xmm2        # 0x407ff8
00000000004018de:   movsd   (%rax,%rsi,8),%xmm0
00000000004018e3:   lea     0x0(,%rdx,8),%rdi
00000000004018eb:   cvtsi2sd        %ecx,%xmm1
00000000004018ef:   addsd   %xmm0,%xmm0
00000000004018f3:   mulsd   0x8(%rax,%rdi,1),%xmm2
00000000004018f9:   addsd   (%rax,%rdx,8),%xmm0
00000000004018fe:   subsd   0x66ea(%rip),%xmm1        # 0x407ff0
0000000000401906:   subsd   %xmm2,%xmm0
000000000040190a:   addsd   0x10(%rax,%rdi,1),%xmm0
0000000000401910:   mulsd   %xmm1,%xmm1
0000000000401914:   mulsd   %xmm1,%xmm0
0000000000401918:   retq    
0000000000401919:   nopl    0x0(%rax)
0000000000401920:   test    %edx,%edx
0000000000401922:   jle     0x401a20 <Heat2D<double, std::vector<double, std::allocator<double> > >::eval_component(int, double, std::vector<double, std::allocator<double> > const&)+512>
0000000000401928:   pxor    %xmm1,%xmm1
000000000040192c:   movslq  %edx,%rax
000000000040192f:   lea     -0x1(%rcx),%rdx
0000000000401933:   cmp     %rdx,%rax
0000000000401936:   cvtsi2sd        %ecx,%xmm1
000000000040193a:   subsd   0x66ae(%rip),%xmm1        # 0x407ff0
0000000000401942:   jae     0x401a60 <Heat2D<double, std::vector<double, std::allocator<double> > >::eval_component(int, double, std::vector<double, std::allocator<double> > const&)+576>
0000000000401948:   lea     -0x1(%rsi),%edx
000000000040194b:   mov     (%r8),%rax
000000000040194e:   add     %esi,%ecx
0000000000401950:   movsd   0x66a0(%rip),%xmm2        # 0x407ff8
0000000000401958:   movslq  %ecx,%rcx
000000000040195b:   movslq  %edx,%rdx
000000000040195e:   mulsd   %xmm1,%xmm1
0000000000401962:   lea     0x0(,%rdx,8),%rdi
000000000040196a:   movsd   (%rax,%rdx,8),%xmm0
000000000040196f:   mulsd   0x8(%rax,%rdi,1),%xmm2
0000000000401975:   subsd   %xmm2,%xmm0
0000000000401979:   movsd   (%rax,%rcx,8),%xmm2
000000000040197e:   addsd   %xmm2,%xmm2
0000000000401982:   addsd   0x10(%rax,%rdi,1),%xmm0
0000000000401988:   addsd   %xmm2,%xmm0
000000000040198c:   mulsd   %xmm1,%xmm0
0000000000401990:   retq    
0000000000401991:   nopl    0x0(%rax)
0000000000401998:   mov     %esi,%edi
000000000040199a:   lea     0x0(,%rdx,8),%r8
00000000004019a2:   sub     %ecx,%edi
00000000004019a4:   movsd   0x664c(%rip),%xmm2        # 0x407ff8
00000000004019ac:   movslq  %edi,%rdi
00000000004019af:   movsd   (%rax,%rdi,8),%xmm0
00000000004019b4:   mulsd   0x8(%rax,%r8,1),%xmm2
00000000004019bb:   addsd   (%rax,%rdx,8),%xmm0
00000000004019c0:   subsd   %xmm2,%xmm0
00000000004019c4:   addsd   0x10(%rax,%r8,1),%xmm0
00000000004019cb:   jmpq    0x401896 <Heat2D<double, std::vector<double, std::allocator<double> > >::eval_component(int, double, std::vector<double, std::allocator<double> > const&)+118>
00000000004019d0:   pxor    %xmm1,%xmm1
00000000004019d4:   mov     %ecx,%eax
00000000004019d6:   imul    %ecx,%eax
00000000004019d9:   mov     (%r8),%rsi
00000000004019dc:   cvtsi2sd        %ecx,%xmm1
00000000004019e0:   lea     -0x2(%rax),%edx
00000000004019e3:   sub     $0x1,%eax
00000000004019e6:   sub     %ecx,%eax
00000000004019e8:   cltq    
00000000004019ea:   movslq  %edx,%rdx
00000000004019ed:   movsd   (%rsi,%rax,8),%xmm0
00000000004019f2:   movsd   0x8(%rsi,%rdx,8),%xmm2
00000000004019f8:   addsd   (%rsi,%rdx,8),%xmm0
00000000004019fd:   addsd   %xmm2,%xmm2
0000000000401a01:   subsd   0x65e7(%rip),%xmm1        # 0x407ff0
0000000000401a09:   subsd   %xmm2,%xmm0
0000000000401a0d:   mulsd   %xmm1,%xmm1
0000000000401a11:   addsd   %xmm1,%xmm1
0000000000401a15:   mulsd   %xmm1,%xmm0
0000000000401a19:   retq    
0000000000401a1a:   nopw    0x0(%rax,%rax,1)
0000000000401a20:   pxor    %xmm1,%xmm1
0000000000401a24:   mov     (%r8),%rax
0000000000401a27:   movsd   0x65d1(%rip),%xmm0        # 0x408000
0000000000401a2f:   cvtsi2sd        %ecx,%xmm1
0000000000401a33:   mulsd   (%rax),%xmm0
0000000000401a37:   movslq  %ecx,%rcx
0000000000401a3a:   addsd   0x8(%rax),%xmm0
0000000000401a3f:   subsd   0x65a9(%rip),%xmm1        # 0x407ff0
0000000000401a47:   addsd   (%rax,%rcx,8),%xmm0
0000000000401a4c:   mulsd   %xmm1,%xmm1
0000000000401a50:   addsd   %xmm1,%xmm1
0000000000401a54:   mulsd   %xmm1,%xmm0
0000000000401a58:   retq    
0000000000401a59:   nopl    0x0(%rax)
0000000000401a60:   mov     (%r8),%rdx
0000000000401a63:   lea     -0x2(%rcx),%eax
0000000000401a66:   mulsd   %xmm1,%xmm1
0000000000401a6a:   cltq    
0000000000401a6c:   movsd   0x8(%rdx,%rax,8),%xmm0
0000000000401a72:   addsd   %xmm1,%xmm1
0000000000401a76:   movapd  %xmm0,%xmm2
0000000000401a7a:   addsd   %xmm0,%xmm2
0000000000401a7e:   movsd   (%rdx,%rax,8),%xmm0
0000000000401a83:   lea     -0x1(%rcx,%rcx,1),%eax
0000000000401a87:   cltq    
0000000000401a89:   subsd   %xmm2,%xmm0
0000000000401a8d:   addsd   (%rdx,%rax,8),%xmm0
0000000000401a92:   mulsd   %xmm1,%xmm0
0000000000401a96:   retq    
0000000000401a97:   nopw    0x0(%rax,%rax,1)
0000000000401aa0:   pxor    %xmm1,%xmm1
0000000000401aa4:   mov     %ecx,%eax
0000000000401aa6:   imul    %ecx,%eax
0000000000401aa9:   lea     (%rcx,%rcx,1),%edx
0000000000401aac:   cvtsi2sd        %ecx,%xmm1
0000000000401ab0:   sub     %edx,%eax
0000000000401ab2:   mov     (%r8),%rdx
0000000000401ab5:   add     %eax,%ecx
0000000000401ab7:   cltq    
0000000000401ab9:   movslq  %ecx,%rsi
0000000000401abc:   add     $0x1,%ecx
0000000000401abf:   movsd   (%rdx,%rsi,8),%xmm0
0000000000401ac4:   movslq  %ecx,%rcx
0000000000401ac7:   movapd  %xmm0,%xmm2
0000000000401acb:   addsd   %xmm0,%xmm2
0000000000401acf:   movsd   (%rdx,%rax,8),%xmm0
0000000000401ad4:   subsd   0x6514(%rip),%xmm1        # 0x407ff0
0000000000401adc:   subsd   %xmm2,%xmm0
0000000000401ae0:   addsd   (%rdx,%rcx,8),%xmm0
0000000000401ae5:   mulsd   %xmm1,%xmm1
0000000000401ae9:   addsd   %xmm1,%xmm1
0000000000401aed:   mulsd   %xmm1,%xmm0
0000000000401af1:   retq    
0000000000401af2:   nopw    0x0(%rax,%rax,1)
0000000000401af8:   pxor    %xmm1,%xmm1
0000000000401afc:   mov     (%r8),%rax
0000000000401aff:   mov     %esi,%edi
0000000000401b01:   lea     0x1(%rsi),%edx
0000000000401b04:   sub     %ecx,%edi
0000000000401b06:   cvtsi2sd        %ecx,%xmm1
0000000000401b0a:   movslq  %edi,%rdi
0000000000401b0d:   add     %esi,%ecx
0000000000401b0f:   movsd   (%rax,%rdi,8),%xmm0
0000000000401b14:   movslq  %edx,%rdx
0000000000401b17:   movslq  %ecx,%rcx
0000000000401b1a:   movsd   (%rax,%rdx,8),%xmm2
0000000000401b1f:   addsd   (%rax,%rcx,8),%xmm0
0000000000401b24:   addsd   %xmm2,%xmm2
0000000000401b28:   addsd   %xmm2,%xmm0
0000000000401b2c:   movsd   0x64c4(%rip),%xmm2        # 0x407ff8
0000000000401b34:   subsd   0x64b4(%rip),%xmm1        # 0x407ff0
0000000000401b3c:   mulsd   -0x8(%rax,%rdx,8),%xmm2
0000000000401b42:   mulsd   %xmm1,%xmm1
0000000000401b46:   subsd   %xmm2,%xmm0
0000000000401b4a:   mulsd   %xmm1,%xmm0
0000000000401b4e:   retq    
0000000000401b4f:   nop

camper

auseinandergenommen (jeweils nur der Zweig, der zu stencil_inner führt):

C:

ode_eval_comp:                                     # edi = i, xmm0 = t, rsi = y
0000000000401110:   mov     %edi,%eax                                # eax = i
0000000000401112:   mov     $0x51eb851f,%edx                         # edx = (1ull << 37) / N     N == 100
0000000000401117:   mov     %edi,%ecx                                # ecx = i
0000000000401119:   imul    %edx                                     # edx = ri << 5, #eax
000000000040111b:   mov     %edi,%eax                                # eax = i
000000000040111d:   sar     $0x1f,%eax                               # eax = i >> 31  (-1 for i < 0, else 0)
0000000000401120:   sar     $0x5,%edx                                # edx = (i * magic) >> 37  (== i / N for i >= 0)
0000000000401123:   sub     %eax,%edx                                # edx = ri == i / N  (adds 1 for i < 0: rounds toward zero)
0000000000401125:   imul    $0x64,%edx,%eax                          # eax = 100 * ri == i / N * N == i - i % N
0000000000401128:   sub     %eax,%ecx                                # ecx = ci == i % N
000000000040112a:   test    %edx,%edx                                # test(ri)
000000000040112c:   jle     0x4011f0 <ode_eval_comp+224>             # <=0: jmp else
0000000000401132:   cmp     $0x62,%edx                               # 98 vs. ri    (N-2 vs. ri)
0000000000401135:   jg      0x401198 <ode_eval_comp+136>             # ri > N-2: jmp else
0000000000401137:   test    %ecx,%ecx                                # test(ci)
0000000000401139:   jle     0x401338 <ode_eval_comp+552>             # <=0: jmp else
000000000040113f:   lea     -0x64(%rdi),%eax                         # eax = i - N
0000000000401142:   cmp     $0x63,%ecx                               # N-1 vs. ci
0000000000401145:   cltq                                             # rax = i - N (sign-extended)
0000000000401147:   lea     0x0(,%rax,8),%rdx                        # rdx = sizeof(double)*(i-N)
000000000040114f:   je      0x401240 <ode_eval_comp+304>             # ci == N-1: jmp else
0000000000401155:   movsd   0x318(%rsi,%rdx,1),%xmm0                 # xmm0 = y[i - N + (N-1)] ; == y[i-1]
000000000040115e:   movslq  %edi,%rdi                                # rdi = i (sign-extended)
0000000000401161:   movsd   0xd09f(%rip),%xmm1        # 0x40e208     # xmm1 = const            ;?== 4
0000000000401169:   addsd   (%rsi,%rax,8),%xmm0                      # xmm0 += y[i - N]
000000000040116e:   mulsd   0x320(%rsi,%rdx,1),%xmm1                 # xmm1 *= y[i - N + N]    ; == y[i]
0000000000401177:   subsd   %xmm1,%xmm0                              # xmm0 -= xmm1
000000000040117b:   addsd   0x8(%rsi,%rdi,8),%xmm0                   # xmm0 += y[i + 1]
0000000000401181:   addsd   0x320(%rsi,%rdi,8),%xmm0                 # xmm0 += y[i + N]
000000000040118a:   mulsd   0xd07e(%rip),%xmm0        # 0x40e210     # xmm0 *= const           ;?== sqr(N-1)
0000000000401192:   retq                                             # retval: xmm0
#mulsd:    2
#addsd:    3
#subsd:    1
#cvtsi2sd: 0

C++

Heat2D<double, std::vector<double, std::allocator<double> > >::eval_component(int, double, std::vector<double, std::allocator<double> > const&):
                                                                     # rdi = this, esi = i, xmm0 = t, rdx = &y
0000000000401820:   mov     0x38(%rdi),%rcx                          # rcx = N                 ;? offset 56
0000000000401824:   mov     %rdx,%r8                                 # r8 = &y
0000000000401827:   movslq  %esi,%rax                                # rax = i
000000000040182a:   xor    %edx,%edx                                 # rdx = 0
000000000040182c:   div     %rcx                                     # rax = ri == i / N, rdx = ci == i % N
000000000040182f:   test    %eax,%eax                                # test(ri)
0000000000401831:   jle     0x401920 <Heat2D<double, std::vector<double, std::allocator<double> > >::eval_component(int, double, std::vector<double, std::allocator<double> > const&)+256>
                                                                     # ri <= 0: jmp else
0000000000401837:   lea     -0x1(%rcx),%rdi                          # rdi = N - 1
000000000040183b:   cltq                                             # rax = ri    ??redundant
000000000040183d:   cmp     %rdi,%rax                                # ri vs. N - 1 
0000000000401840:   jae     0x4018b0 <Heat2D<double, std::vector<double, std::allocator<double> > >::eval_component(int, double, std::vector<double, std::allocator<double> > const&)+144>
                                                                     # ri >= N - 1 : jmp else
0000000000401842:   test    %edx,%edx                                # test(ci)
0000000000401844:   jle     0x401af8 <Heat2D<double, std::vector<double, std::allocator<double> > >::eval_component(int, double, std::vector<double, std::allocator<double> > const&)+728>
                                                                     # ci <= 0: jmp else
000000000040184a:   pxor    %xmm1,%xmm1                              # xmm1 = 0
000000000040184e:   movslq  %edx,%rax                                # rax = ci
0000000000401851:   lea     -0x1(%rsi),%edx                          # edx = i - 1
0000000000401854:   cmp     %rax,%rdi                                # ci vs. N - 1
0000000000401857:   mov     (%r8),%rax                               # rax = &y[0]
000000000040185a:   movslq  %edx,%rdx                                # rdx = i - 1
000000000040185d:   cvtsi2sd        %ecx,%xmm1                       # xmm1 = N    (int->double)
0000000000401861:   subsd   0x6787(%rip),%xmm1        # 0x407ff0     # xmm1 -= const        ;? == 1
0000000000401869:   ja      0x401998 <Heat2D<double, std::vector<double, std::allocator<double> > >::eval_component(int, double, std::vector<double, std::allocator<double> > const&)+376>
                                                                     # N - 1 > ci: jmp if
...
0000000000401998:   mov     %esi,%edi                                # edi = i
000000000040199a:   lea     0x0(,%rdx,8),%r8                         # r8 = sizeof(double)*(i-1)
00000000004019a2:   sub     %ecx,%edi                                # edi = i - N
00000000004019a4:   movsd   0x664c(%rip),%xmm2        # 0x407ff8     # xmm2 = const         ;? == 4
00000000004019ac:   movslq  %edi,%rdi                                # rdi = i - N
00000000004019af:   movsd   (%rax,%rdi,8),%xmm0                      # xmm0 = y[i - N]
00000000004019b4:   mulsd   0x8(%rax,%r8,1),%xmm2                    # xmm2 *= y[i]
00000000004019bb:   addsd   (%rax,%rdx,8),%xmm0                      # xmm0 += y[i-1]
00000000004019c0:   subsd   %xmm2,%xmm0                              # xmm0 -= xmm2
00000000004019c4:   addsd   0x10(%rax,%r8,1),%xmm0                   # xmm0 += y[i-1+2]     ; == y[i+1]
00000000004019cb:   jmpq    0x401896 <Heat2D<double, std::vector<double, std::allocator<double> > >::eval_component(int, double, std::vector<double, std::allocator<double> > const&)+118>
...
0000000000401896:   add     %esi,%ecx                                # ecx = i + N
0000000000401898:   mulsd   %xmm1,%xmm1                              # xmm1 *= xmm1         ; sqr(N-1)
000000000040189c:   movslq  %ecx,%rcx                                # rcx = i + N (sign-extended)
000000000040189f:   addsd   (%rax,%rcx,8),%xmm0                      # xmm0 += y[i+N]
00000000004018a4:   mulsd   %xmm1,%xmm0                              # xmm0 *= xmm1
00000000004018a8:   retq                                             # retval: xmm0

#mulsd:    3
#addsd:    3
#subsd:    2
#cvtsi2sd: 1

Die Konstanz von N macht den Unterschied, der Rest ist im Grunde gleich. Beide Varianten würden wahrscheinlich davon profitieren, wenn i und N vorzeichenlos wären; in diesem Fall würden einige Sign-Extends (cltq, movslq) überflüssig.

Wow, vielen Dank für die ausführliche Antwort. Noch habe ich den kommentierten Assembler-Code nicht genau studiert, aber ich habe im C++-Programm nun auch eine Wärmeleitungsgleichung mit konstanter Größe implementiert und die Laufzeit kann nun mit dem C-Programm mithalten.

Wie kamst du darauf, dass die konstante Größe der ausschlaggebende Punkt ist?
Sieht man das an bestimmten Zeilen im Assembly oder war das eine Vermutung/Erfahrungssache?

camper

Es ist naheliegend, da N als Teil der Berechnung auftaucht. Natürlich bedeutet es weniger Arbeit zur Laufzeit, wenn der Compiler einen Teil der Berechnung bereits während des Compilierens durchführen kann.

Wobei ich immer noch davon ausgehe, dass die Funktion selbst gar nicht das Problem ist, sondern eher die Tatsache, dass sie überhaupt so aufgerufen wird.