Function: update_halo_kernel(int, int, int, int, std::array<int, 4ul> const&, std::array<int, 4ul> const&, field_type&, int const*, int) | Module: exec | Source: update_halo.cpp:183-186 | Coverage: 0.01% |
---|
Function: update_halo_kernel(int, int, int, int, std::array<int, 4ul> const&, std::array<int, 4ul> const&, field_type&, int const*, int) | Module: exec | Source: update_halo.cpp:183-186 | Coverage: 0.01% |
---|
/scratch_na/users/xoserete/qaas_runs/171-291-1831/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/update_halo.cpp: 183 - 186 |
-------------------------------------------------------------------------------- |
183: #pragma omp parallel for simd |
184: for (int j = (x_min - depth + 1); j < (x_max + depth + 2); j++) { |
185: for (int k = 0; k < depth; ++k) { |
186: field.energy1(j, y_max + 2 + k) = field.energy1(j, y_max + 1 - k); |
0x434510 PUSH %RBP |
0x434511 MOV %RSP,%RBP |
0x434514 PUSH %R14 |
0x434516 PUSH %R13 |
0x434518 MOV %RDI,%R13 |
0x43451b PUSH %R12 |
0x43451d PUSH %RBX |
0x43451e MOV 0x14(%RDI),%EBX |
0x434521 MOV 0x8(%RDI),%R8D |
0x434525 SUB %EBX,%R8D |
0x434528 LEA 0x1(%R8),%R14D |
0x43452c CALL 4046c0 <omp_get_num_threads@plt> |
0x434531 MOV %EAX,%R12D |
0x434534 CALL 4045b0 <omp_get_thread_num@plt> |
0x434539 MOV %EAX,%ECX |
0x43453b MOV 0xc(%R13),%EAX |
0x43453f ADD %EBX,%EAX |
0x434541 ADD $0x2,%EAX |
0x434544 SUB %R14D,%EAX |
0x434547 CLTD |
0x434548 IDIV %R12D |
0x43454b CMP %EDX,%ECX |
0x43454d JL 434710 |
0x434553 IMUL %EAX,%ECX |
0x434556 ADD %ECX,%EDX |
0x434558 ADD %EDX,%EAX |
0x43455a CMP %EAX,%EDX |
0x43455c JGE 434707 |
0x434562 LEA (%R14,%RDX,1),%EDI |
0x434566 MOV 0x10(%R13),%R10D |
0x43456a MOV (%R13),%R11 |
0x43456e LEA (%R14,%RAX,1),%R8D |
0x434572 MOVSXD %EDI,%RSI |
0x434575 TEST %EBX,%EBX |
0x434577 JLE 434707 |
0x43457d MOV 0x48(%R11),%R14 |
0x434581 LEA 0x1(%R10),%R13D |
0x434585 MOV 0x58(%R11),%R9 |
0x434589 SUB %ESI,%EDI |
0x43458b MOVSXD %R13D,%R10 |
0x43458e IMUL %R14,%R10 |
0x434592 MOV %R14,%RDX |
0x434595 NEG %RDX |
0x434598 SAL $0x3,%RDX |
0x43459c ADD %R10,%R14 |
0x43459f NOP |
(248) 0x4345a0 LEA (%RSI,%R10,1),%R12 |
(248) 0x4345a4 LEA (%R14,%RSI,1),%RCX |
(248) 0x4345a8 MOV %EBX,%R13D |
(248) 0x4345ab LEA (%R9,%R12,8),%R11 |
(248) 0x4345af LEA (%R9,%RCX,8),%RAX |
(248) 0x4345b3 XOR %R12D,%R12D |
(248) 0x4345b6 AND $0x7,%R13D |
(248) 0x4345ba JE 434672 |
(248) 0x4345c0 CMP $0x1,%R13D |
(248) 0x4345c4 JE 434657 |
(248) 0x4345ca CMP $0x2,%R13D |
(248) 0x4345ce JE 434645 |
(248) 0x4345d0 CMP $0x3,%R13D |
(248) 0x4345d4 JE 434633 |
(248) 0x4345d6 CMP $0x4,%R13D |
(248) 0x4345da JE 434621 |
(248) 0x4345dc CMP $0x5,%R13D |
(248) 0x4345e0 JE 43460f |
(248) 0x4345e2 CMP $0x6,%R13D |
(248) 0x4345e6 JE 4345fd |
(248) 0x4345e8 VMOVSD (%R11),%XMM0 |
(248) 0x4345ed MOV $0x1,%R12D |
(248) 0x4345f3 ADD %RDX,%R11 |
(248) 0x4345f6 VMOVSD %XMM0,(%RAX) |
(248) 0x4345fa SUB %RDX,%RAX |
(248) 0x4345fd VMOVSD (%R11),%XMM1 |
(248) 0x434602 INC %R12D |
(248) 0x434605 ADD %RDX,%R11 |
(248) 0x434608 VMOVSD %XMM1,(%RAX) |
(248) 0x43460c SUB %RDX,%RAX |
(248) 0x43460f VMOVSD (%R11),%XMM2 |
(248) 0x434614 INC %R12D |
(248) 0x434617 ADD %RDX,%R11 |
(248) 0x43461a VMOVSD %XMM2,(%RAX) |
(248) 0x43461e SUB %RDX,%RAX |
(248) 0x434621 VMOVSD (%R11),%XMM3 |
(248) 0x434626 INC %R12D |
(248) 0x434629 ADD %RDX,%R11 |
(248) 0x43462c VMOVSD %XMM3,(%RAX) |
(248) 0x434630 SUB %RDX,%RAX |
(248) 0x434633 VMOVSD (%R11),%XMM4 |
(248) 0x434638 INC %R12D |
(248) 0x43463b ADD %RDX,%R11 |
(248) 0x43463e VMOVSD %XMM4,(%RAX) |
(248) 0x434642 SUB %RDX,%RAX |
(248) 0x434645 VMOVSD (%R11),%XMM5 |
(248) 0x43464a INC %R12D |
(248) 0x43464d ADD %RDX,%R11 |
(248) 0x434650 VMOVSD %XMM5,(%RAX) |
(248) 0x434654 SUB %RDX,%RAX |
(248) 0x434657 VMOVSD (%R11),%XMM6 |
(248) 0x43465c INC %R12D |
(248) 0x43465f ADD %RDX,%R11 |
(248) 0x434662 VMOVSD %XMM6,(%RAX) |
(248) 0x434666 SUB %RDX,%RAX |
(248) 0x434669 CMP %R12D,%EBX |
(248) 0x43466c JE 4346f7 |
(247) 0x434672 VMOVSD (%R11),%XMM7 |
(247) 0x434677 ADD %RDX,%R11 |
(247) 0x43467a ADD $0x8,%R12D |
(247) 0x43467e VMOVSD %XMM7,(%RAX) |
(247) 0x434682 SUB %RDX,%RAX |
(247) 0x434685 VMOVSD (%R11),%XMM8 |
(247) 0x43468a ADD %RDX,%R11 |
(247) 0x43468d VMOVSD %XMM8,(%RAX) |
(247) 0x434691 SUB %RDX,%RAX |
(247) 0x434694 VMOVSD (%R11),%XMM9 |
(247) 0x434699 ADD %RDX,%R11 |
(247) 0x43469c VMOVSD %XMM9,(%RAX) |
(247) 0x4346a0 SUB %RDX,%RAX |
(247) 0x4346a3 VMOVSD (%R11),%XMM10 |
(247) 0x4346a8 ADD %RDX,%R11 |
(247) 0x4346ab VMOVSD %XMM10,(%RAX) |
(247) 0x4346af SUB %RDX,%RAX |
(247) 0x4346b2 VMOVSD (%R11),%XMM11 |
(247) 0x4346b7 ADD %RDX,%R11 |
(247) 0x4346ba VMOVSD %XMM11,(%RAX) |
(247) 0x4346be SUB %RDX,%RAX |
(247) 0x4346c1 VMOVSD (%R11),%XMM12 |
(247) 0x4346c6 ADD %RDX,%R11 |
(247) 0x4346c9 VMOVSD %XMM12,(%RAX) |
(247) 0x4346cd SUB %RDX,%RAX |
(247) 0x4346d0 VMOVSD (%R11),%XMM13 |
(247) 0x4346d5 ADD %RDX,%R11 |
(247) 0x4346d8 VMOVSD %XMM13,(%RAX) |
(247) 0x4346dc SUB %RDX,%RAX |
(247) 0x4346df VMOVSD (%R11),%XMM14 |
(247) 0x4346e4 ADD %RDX,%R11 |
(247) 0x4346e7 VMOVSD %XMM14,(%RAX) |
(247) 0x4346eb SUB %RDX,%RAX |
(247) 0x4346ee CMP %R12D,%EBX |
(247) 0x4346f1 JNE 434672 |
(248) 0x4346f7 INC %RSI |
(248) 0x4346fa LEA (%RDI,%RSI,1),%R11D |
(248) 0x4346fe CMP %R11D,%R8D |
(248) 0x434701 JG 4345a0 |
0x434707 POP %RBX |
0x434708 POP %R12 |
0x43470a POP %R13 |
0x43470c POP %R14 |
0x43470e POP %RBP |
0x43470f RET |
0x434710 INC %EAX |
0x434712 XOR %EDX,%EDX |
0x434714 JMP 434553 |
0x434719 NOPL (%RAX) |
Path / |
Source file and lines | update_halo.cpp:183-186 |
Module | exec |
nb instructions | 56 |
nb uops | 61 |
loop length | 169 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
micro-operation queue | 10.17 cycles |
front end | 10.17 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 6.00 | 6.00 | 4.33 | 4.33 | 3.50 | 6.00 | 6.00 | 3.50 | 3.50 | 3.50 | 6.00 | 4.33 |
cycles | 6.00 | 9.07 | 4.33 | 4.33 | 3.50 | 6.00 | 6.00 | 3.50 | 3.50 | 3.50 | 6.00 | 4.33 |
Cycles executing div or sqrt instructions | 6.00 |
FE+BE cycles | 16.79-16.94 |
Stall cycles | 6.68-6.83 |
LM full (events) | 8.27-8.42 |
Front-end | 10.17 |
Dispatch | 9.07 |
DIV/SQRT | 6.00 |
Overall L1 | 10.17 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 0% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 0% |
other | 0% |
all | 7% |
load | 9% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 6% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 6% |
other | 7% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RDI,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV 0x14(%RDI),%EBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RDI),%R8D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %EBX,%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
LEA 0x1(%R8),%R14D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
CALL 4046c0 <omp_get_num_threads@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV %EAX,%R12D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CALL 4045b0 <omp_get_thread_num@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV %EAX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0xc(%R13),%EAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %EBX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD $0x2,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SUB %R14D,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CLTD | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
IDIV %R12D | 4 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11-16 | 6 |
CMP %EDX,%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JL 434710 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.13+0x200> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
IMUL %EAX,%ECX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %ECX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %EDX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP %EAX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JGE 434707 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.13+0x1f7> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
LEA (%R14,%RDX,1),%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
MOV 0x10(%R13),%R10D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%R13),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%R14,%RAX,1),%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
MOVSXD %EDI,%RSI | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
TEST %EBX,%EBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
JLE 434707 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.13+0x1f7> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x48(%R11),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA 0x1(%R10),%R13D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
MOV 0x58(%R11),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %ESI,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOVSXD %R13D,%R10 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
IMUL %R14,%R10 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %R14,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
NEG %RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SAL $0x3,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
ADD %R10,%R14 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
NOP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
RET | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0.33 | 0 | 2.13 |
INC %EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 434553 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.13+0x43> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Source file and lines | update_halo.cpp:183-186 |
Module | exec |
nb instructions | 56 |
nb uops | 61 |
loop length | 169 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
micro-operation queue | 10.17 cycles |
front end | 10.17 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 6.00 | 6.00 | 4.33 | 4.33 | 3.50 | 6.00 | 6.00 | 3.50 | 3.50 | 3.50 | 6.00 | 4.33 |
cycles | 6.00 | 9.07 | 4.33 | 4.33 | 3.50 | 6.00 | 6.00 | 3.50 | 3.50 | 3.50 | 6.00 | 4.33 |
Cycles executing div or sqrt instructions | 6.00 |
FE+BE cycles | 16.79-16.94 |
Stall cycles | 6.68-6.83 |
LM full (events) | 8.27-8.42 |
Front-end | 10.17 |
Dispatch | 9.07 |
DIV/SQRT | 6.00 |
Overall L1 | 10.17 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 0% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 0% |
other | 0% |
all | 7% |
load | 9% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 6% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 6% |
other | 7% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RDI,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV 0x14(%RDI),%EBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x8(%RDI),%R8D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %EBX,%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
LEA 0x1(%R8),%R14D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
CALL 4046c0 <omp_get_num_threads@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV %EAX,%R12D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CALL 4045b0 <omp_get_thread_num@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV %EAX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0xc(%R13),%EAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %EBX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD $0x2,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SUB %R14D,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CLTD | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
IDIV %R12D | 4 | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11-16 | 6 |
CMP %EDX,%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JL 434710 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.13+0x200> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
IMUL %EAX,%ECX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %ECX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %EDX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP %EAX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JGE 434707 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.13+0x1f7> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
LEA (%R14,%RDX,1),%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
MOV 0x10(%R13),%R10D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%R13),%R11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%R14,%RAX,1),%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
MOVSXD %EDI,%RSI | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
TEST %EBX,%EBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
JLE 434707 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.13+0x1f7> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x48(%R11),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA 0x1(%R10),%R13D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
MOV 0x58(%R11),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %ESI,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOVSXD %R13D,%R10 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
IMUL %R14,%R10 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %R14,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
NEG %RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SAL $0x3,%RDX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
ADD %R10,%R14 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
NOP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
RET | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0.33 | 0 | 2.13 |
INC %EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 434553 <_Z18update_halo_kerneliiiiRKSt5arrayIiLm4EES2_R10field_typePKii._omp_fn.13+0x43> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Name | Coverage (%) | Time (s) |
---|---|---|
▼update_halo_kernel(int, int, int, int, std::array<int, 4ul> const&, std::array<int, 4ul> const&, field_type&, int const*, int) | 0.01 | 0 |
▼Loop 248 - update_halo.cpp:185-186 - exec | 0.01 | 0.01 |
○Loop 247 - update_halo.cpp:185-186 - exec | 0 | 0 |