Function: update_halo_kernel(int, int, int, int, std::array<int, 4ul> const&, std::array<int, 4ul> const&, field_type&, int const*, int) | Module: exec | Source: update_halo.cpp:97-100 | Coverage: 0.01% |
---|
Function: update_halo_kernel(int, int, int, int, std::array<int, 4ul> const&, std::array<int, 4ul> const&, field_type&, int const*, int) | Module: exec | Source: update_halo.cpp:97-100 | Coverage: 0.01% |
---|
/scratch_na/users/xoserete/qaas_runs/171-320-5323/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/update_halo.cpp: 97 - 100 |
-------------------------------------------------------------------------------- |
97: #pragma omp parallel for simd |
98: for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { |
99: for (int k = 0; k < depth; ++k) { |
100: field.density1(j, y_max + 2 + k) = field.density1(j, y_max + 1 - k); |
0x4331b0 PUSH %RBP |
0x4331b1 MOV %RSP,%RBP |
0x4331b4 PUSH %R14 |
0x4331b6 PUSH %R13 |
0x4331b8 MOV %RDI,%R13 |
0x4331bb PUSH %R12 |
0x4331bd PUSH %RBX |
0x4331be MOV 0x14(%RDI),%EBX |
0x4331c1 MOV 0x8(%RDI),%R8D |
0x4331c5 SUB %EBX,%R8D |
0x4331c8 LEA 0x1(%R8),%R14D |
0x4331cc CALL 4046c0 <omp_get_num_threads@plt> |
0x4331d1 MOV %EAX,%R12D |
0x4331d4 CALL 4045b0 <omp_get_thread_num@plt> |
0x4331d9 MOV %EAX,%ECX |
0x4331db MOV 0xc(%R13),%EAX |
0x4331df ADD %EBX,%EAX |
0x4331e1 ADD $0x2,%EAX |
0x4331e4 SUB %R14D,%EAX |
0x4331e7 CLTD |
0x4331e8 IDIV %R12D |
0x4331eb CMP %EDX,%ECX |
0x4331ed JL 4333b0 |
0x4331f3 IMUL %EAX,%ECX |
0x4331f6 ADD %ECX,%EDX |
0x4331f8 ADD %EDX,%EAX |
0x4331fa CMP %EAX,%EDX |
0x4331fc JGE 4333a7 |
0x433202 LEA (%R14,%RDX,1),%EDI |
0x433206 MOV 0x10(%R13),%R10D |
0x43320a MOV (%R13),%R11 |
0x43320e LEA (%R14,%RAX,1),%R8D |
0x433212 MOVSXD %EDI,%RSI |
0x433215 TEST %EBX,%EBX |
0x433217 JLE 4333a7 |
0x43321d MOV 0x18(%R11),%R14 |
0x433221 LEA 0x1(%R10),%R13D |
0x433225 MOV 0x28(%R11),%R9 |
0x433229 SUB %ESI,%EDI |
0x43322b MOVSXD %R13D,%R10 |
0x43322e IMUL %R14,%R10 |
0x433232 MOV %R14,%RDX |
0x433235 NEG %RDX |
0x433238 SAL $0x3,%RDX |
0x43323c ADD %R10,%R14 |
0x43323f NOP |
(228) 0x433240 LEA (%RSI,%R10,1),%R12 |
(228) 0x433244 LEA (%R14,%RSI,1),%RCX |
(228) 0x433248 MOV %EBX,%R13D |
(228) 0x43324b LEA (%R9,%R12,8),%R11 |
(228) 0x43324f LEA (%R9,%RCX,8),%RAX |
(228) 0x433253 XOR %R12D,%R12D |
(228) 0x433256 AND $0x7,%R13D |
(228) 0x43325a JE 433312 |
(228) 0x433260 CMP $0x1,%R13D |
(228) 0x433264 JE 4332f7 |
(228) 0x43326a CMP $0x2,%R13D |
(228) 0x43326e JE 4332e5 |
(228) 0x433270 CMP $0x3,%R13D |
(228) 0x433274 JE 4332d3 |
(228) 0x433276 CMP $0x4,%R13D |
(228) 0x43327a JE 4332c1 |
(228) 0x43327c CMP $0x5,%R13D |
(228) 0x433280 JE 4332af |
(228) 0x433282 CMP $0x6,%R13D |
(228) 0x433286 JE 43329d |
(228) 0x433288 VMOVSD (%R11),%XMM0 |
(228) 0x43328d MOV $0x1,%R12D |
(228) 0x433293 ADD %RDX,%R11 |
(228) 0x433296 VMOVSD %XMM0,(%RAX) |
(228) 0x43329a SUB %RDX,%RAX |
(228) 0x43329d VMOVSD (%R11),%XMM1 |
(228) 0x4332a2 INC %R12D |
(228) 0x4332a5 ADD %RDX,%R11 |
(228) 0x4332a8 VMOVSD %XMM1,(%RAX) |
(228) 0x4332ac SUB %RDX,%RAX |
(228) 0x4332af VMOVSD (%R11),%XMM2 |
(228) 0x4332b4 INC %R12D |
(228) 0x4332b7 ADD %RDX,%R11 |
(228) 0x4332ba VMOVSD %XMM2,(%RAX) |
(228) 0x4332be SUB %RDX,%RAX |
(228) 0x4332c1 VMOVSD (%R11),%XMM3 |
(228) 0x4332c6 INC %R12D |
(228) 0x4332c9 ADD %RDX,%R11 |
(228) 0x4332cc VMOVSD %XMM3,(%RAX) |
(228) 0x4332d0 SUB %RDX,%RAX |
(228) 0x4332d3 VMOVSD (%R11),%XMM4 |
(228) 0x4332d8 INC %R12D |
(228) 0x4332db ADD %RDX,%R11 |
(228) 0x4332de VMOVSD %XMM4,(%RAX) |
(228) 0x4332e2 SUB %RDX,%RAX |
(228) 0x4332e5 VMOVSD (%R11),%XMM5 |
(228) 0x4332ea INC %R12D |
(228) 0x4332ed ADD %RDX,%R11 |
(228) 0x4332f0 VMOVSD %XMM5,(%RAX) |
(228) 0x4332f4 SUB %RDX,%RAX |
(228) 0x4332f7 VMOVSD (%R11),%XMM6 |
(228) 0x4332fc INC %R12D |
(228) 0x4332ff ADD %RDX,%R11 |
(228) 0x433302 VMOVSD %XMM6,(%RAX) |
(228) 0x433306 SUB %RDX,%RAX |
(228) 0x433309 CMP %R12D,%EBX |
(228) 0x43330c JE 433397 |
(227) 0x433312 VMOVSD (%R11),%XMM7 |
(227) 0x433317 ADD %RDX,%R11 |
(227) 0x43331a ADD $0x8,%R12D |
(227) 0x43331e VMOVSD %XMM7,(%RAX) |
(227) 0x433322 SUB %RDX,%RAX |
(227) 0x433325 VMOVSD (%R11),%XMM8 |
(227) 0x43332a ADD %RDX,%R11 |
(227) 0x43332d VMOVSD %XMM8,(%RAX) |
(227) 0x433331 SUB %RDX,%RAX |
(227) 0x433334 VMOVSD (%R11),%XMM9 |
(227) 0x433339 ADD %RDX,%R11 |
(227) 0x43333c VMOVSD %XMM9,(%RAX) |
(227) 0x433340 SUB %RDX,%RAX |
(227) 0x433343 VMOVSD (%R11),%XMM10 |
(227) 0x433348 ADD %RDX,%R11 |
(227) 0x43334b VMOVSD %XMM10,(%RAX) |
(227) 0x43334f SUB %RDX,%RAX |
(227) 0x433352 VMOVSD (%R11),%XMM11 |
(227) 0x433357 ADD %RDX,%R11 |
(227) 0x43335a VMOVSD %XMM11,(%RAX) |
(227) 0x43335e SUB %RDX,%RAX |
(227) 0x433361 VMOVSD (%R11),%XMM12 |
(227) 0x433366 ADD %RDX,%R11 |
(227) 0x433369 VMOVSD %XMM12,(%RAX) |
(227) 0x43336d SUB %RDX,%RAX |
(227) 0x433370 VMOVSD (%R11),%XMM13 |
(227) 0x433375 ADD %RDX,%R11 |
(227) 0x433378 VMOVSD %XMM13,(%RAX) |
(227) 0x43337c SUB %RDX,%RAX |
(227) 0x43337f VMOVSD (%R11),%XMM14 |
(227) 0x433384 ADD %RDX,%R11 |
(227) 0x433387 VMOVSD %XMM14,(%RAX) |
(227) 0x43338b SUB %RDX,%RAX |
(227) 0x43338e CMP %R12D,%EBX |
(227) 0x433391 JNE 433312 |
(228) 0x433397 INC %RSI |
(228) 0x43339a LEA (%RDI,%RSI,1),%R11D |
(228) 0x43339e CMP %R11D,%R8D |
(228) 0x4333a1 JG 433240 |
0x4333a7 POP %RBX |
0x4333a8 POP %R12 |
0x4333aa POP %R13 |
0x4333ac POP %R14 |
0x4333ae POP %RBP |
0x4333af RET |
0x4333b0 INC %EAX |
0x4333b2 XOR %EDX,%EDX |
0x4333b4 JMP 4331f3 |
0x4333b9 NOPL (%RAX) |
Path / |
Source file and lines | update_halo.cpp:97-100 |
Module | exec |
nb instructions | 56 |
nb uops | 61 |
loop length | 169 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
micro-operation queue | 10.17 cycles |
front end | 10.17 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 6.00 | 6.00 | 4.33 | 4.33 | 3.50 | 6.00 | 6.00 | 3.50 | 3.50 | 3.50 | 6.00 | 4.33 |
cycles | 6.00 | 9.07 | 4.33 | 4.33 | 3.50 | 6.00 | 6.00 | 3.50 | 3.50 | 3.50 | 6.00 | 4.33 |
Cycles executing div or sqrt instructions | 6.00 |
FE+BE cycles | 16.79-16.94 |
Stall cycles | 6.68-6.83 |
LM full (events) | 8.27-8.42 |
Front-end | 10.17 |
Dispatch | 9.07 |
DIV/SQRT | 6.00 |
Overall L1 | 10.17 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 0% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 0% |
other | 0% |
all | 7% |
load | 9% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 6% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 6% |
other | 7% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RDI,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV 0x14(%RDI),%EBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RDI),%R8D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %EBX,%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
LEA 0x1(%R8),%R14D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
CALL 4046c0 <omp_get_num_threads@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV %EAX,%R12D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CALL 4045b0 <omp_get_thread_num@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV %EAX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0xc(%R13),%EAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %EBX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD $0x2,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SUB %R14D,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CLTD | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
IDIV %R12D | 4 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11-16 | 6 |
CMP %EDX,%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JL 4333b0 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.5+0x200> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
IMUL %EAX,%ECX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %ECX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %EDX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP %EAX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JGE 4333a7 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.5+0x1f7> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
LEA (%R14,%RDX,1),%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
MOV 0x10(%R13),%R10D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%R13),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%R14,%RAX,1),%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
MOVSXD %EDI,%RSI | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
TEST %EBX,%EBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
JLE 4333a7 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.5+0x1f7> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x18(%R11),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA 0x1(%R10),%R13D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
MOV 0x28(%R11),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %ESI,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOVSXD %R13D,%R10 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
IMUL %R14,%R10 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %R14,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
NEG %RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SAL $0x3,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
ADD %R10,%R14 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
NOP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
RET | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0.33 | 0 | 2.13 |
INC %EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4331f3 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.5+0x43> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Source file and lines | update_halo.cpp:97-100 |
Module | exec |
nb instructions | 56 |
nb uops | 61 |
loop length | 169 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
micro-operation queue | 10.17 cycles |
front end | 10.17 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 6.00 | 6.00 | 4.33 | 4.33 | 3.50 | 6.00 | 6.00 | 3.50 | 3.50 | 3.50 | 6.00 | 4.33 |
cycles | 6.00 | 9.07 | 4.33 | 4.33 | 3.50 | 6.00 | 6.00 | 3.50 | 3.50 | 3.50 | 6.00 | 4.33 |
Cycles executing div or sqrt instructions | 6.00 |
FE+BE cycles | 16.79-16.94 |
Stall cycles | 6.68-6.83 |
LM full (events) | 8.27-8.42 |
Front-end | 10.17 |
Dispatch | 9.07 |
DIV/SQRT | 6.00 |
Overall L1 | 10.17 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 0% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 0% |
other | 0% |
all | 7% |
load | 9% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 6% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 6% |
other | 7% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RDI,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV 0x14(%RDI),%EBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RDI),%R8D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %EBX,%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
LEA 0x1(%R8),%R14D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
CALL 4046c0 <omp_get_num_threads@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV %EAX,%R12D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CALL 4045b0 <omp_get_thread_num@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV %EAX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0xc(%R13),%EAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %EBX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD $0x2,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SUB %R14D,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CLTD | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
IDIV %R12D | 4 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11-16 | 6 |
CMP %EDX,%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JL 4333b0 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.5+0x200> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
IMUL %EAX,%ECX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %ECX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %EDX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP %EAX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JGE 4333a7 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.5+0x1f7> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
LEA (%R14,%RDX,1),%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
MOV 0x10(%R13),%R10D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%R13),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%R14,%RAX,1),%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
MOVSXD %EDI,%RSI | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
TEST %EBX,%EBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
JLE 4333a7 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.5+0x1f7> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x18(%R11),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA 0x1(%R10),%R13D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
MOV 0x28(%R11),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %ESI,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOVSXD %R13D,%R10 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
IMUL %R14,%R10 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %R14,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
NEG %RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SAL $0x3,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
ADD %R10,%R14 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
NOP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
RET | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0.33 | 0 | 2.13 |
INC %EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4331f3 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.5+0x43> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Name | Coverage (%) | Time (s) |
---|---|---|
▼update_halo_kernel(int, int, int, int, std::array<int, 4ul> const&, std::array<int, 4ul> const&, field_type&, int const*, int) | 0.01 | 0 |
▼Loop 228 - update_halo.cpp:99-100 - exec– | 0.01 | 0.01 |
○Loop 227 - update_halo.cpp:99-100 - exec | 0 | 0 |