Function: hypre_IJMatrixAssembleParCSR._omp_fn.1 | Module: exec | Source: IJMatrix_parcsr.c:2798-2812 | Coverage: 0.56% |
---|
Function: hypre_IJMatrixAssembleParCSR._omp_fn.1 | Module: exec | Source: IJMatrix_parcsr.c:2798-2812 | Coverage: 0.56% |
---|
/home/kcamus/qaas_runs/169-443-9681/intel/AMG/build/AMG/AMG/IJ_mv/IJMatrix_parcsr.c: 2798 - 2812 |
-------------------------------------------------------------------------------- |
2798: #pragma omp parallel for private (i,j,j0,temp) |
2799: #endif |
2800: for (i=0; i < num_rows; i++) |
2801: { |
2802: j0 = diag_i[i]; |
2803: for (j=j0; j < diag_i[i+1]; j++) |
2804: { |
2805: diag_j[j] -= col_0; |
2806: if (diag_j[j] == i) |
2807: { |
2808: temp = diag_data[j0]; |
2809: diag_data[j0] = diag_data[j]; |
2810: diag_data[j] = temp; |
2811: diag_j[j] = diag_j[j0]; |
2812: diag_j[j0] = i; |
0x57f7b0 PUSH %RBP |
0x57f7b1 MOV %RSP,%RBP |
0x57f7b4 PUSH %R13 |
0x57f7b6 MOV %RDI,%R13 |
0x57f7b9 PUSH %R12 |
0x57f7bb PUSH %RBX |
0x57f7bc SUB $0x8,%RSP |
0x57f7c0 CALL 40f0b0 <omp_get_num_threads@plt> |
0x57f7c5 MOV %EAX,%EBX |
0x57f7c7 CALL 40f1f0 <omp_get_thread_num@plt> |
0x57f7cc MOVSXD %EBX,%RSI |
0x57f7cf MOVSXD %EAX,%RCX |
0x57f7d2 MOV 0x18(%R13),%RAX |
0x57f7d6 CQTO |
0x57f7d8 IDIV %RSI |
0x57f7db CMP %RDX,%RCX |
0x57f7de JL 57f89e |
0x57f7e4 IMUL %RAX,%RCX |
0x57f7e8 ADD %RCX,%RDX |
0x57f7eb LEA (%RAX,%RDX,1),%R12 |
0x57f7ef CMP %R12,%RDX |
0x57f7f2 JGE 57f893 |
0x57f7f8 MOV (%R13),%R8 |
0x57f7fc LEA 0x1(%RDX),%RBX |
0x57f800 MOV 0x20(%R13),%R10 |
0x57f804 MOV 0x10(%R13),%R11 |
0x57f808 MOV 0x8(%R13),%RDI |
0x57f80c LEA (%R8,%RBX,8),%R13 |
(2823) 0x57f810 MOV -0x8(%R13),%RCX |
(2823) 0x57f814 LEA (,%RCX,8),%R9 |
(2823) 0x57f81c LEA (%R11,%R9,1),%RAX |
(2823) 0x57f820 ADD %RDI,%R9 |
(2823) 0x57f823 CMP (%R13),%RCX |
(2823) 0x57f827 JGE 57f849 |
(2823) 0x57f829 NOPL (%RAX) |
(2824) 0x57f830 MOV (%RDI,%RCX,8),%RSI |
(2824) 0x57f834 SUB %R10,%RSI |
(2824) 0x57f837 MOV %RSI,(%RDI,%RCX,8) |
(2824) 0x57f83b CMP %RSI,%RDX |
(2824) 0x57f83e JE 57f860 |
(2824) 0x57f840 INC %RCX |
(2824) 0x57f843 CMP %RCX,(%R13) |
(2824) 0x57f847 JG 57f830 |
(2823) 0x57f849 MOV %RBX,%RDX |
(2823) 0x57f84c ADD $0x8,%R13 |
(2823) 0x57f850 CMP %RBX,%R12 |
(2823) 0x57f853 JE 57f893 |
(2823) 0x57f855 INC %RBX |
(2823) 0x57f858 JMP 57f810 |
0x57f85a NOPW (%RAX,%RAX,1) |
(2824) 0x57f860 VMOVSD (%R11,%RCX,8),%XMM1 |
(2824) 0x57f866 VMOVSD (%RAX),%XMM0 |
(2824) 0x57f86a MOV (%R9),%R8 |
(2824) 0x57f86d VMOVSD %XMM1,(%RAX) |
(2824) 0x57f871 VMOVSD %XMM0,(%R11,%RCX,8) |
(2824) 0x57f877 MOV %R8,(%RDI,%RCX,8) |
(2824) 0x57f87b INC %RCX |
(2824) 0x57f87e MOV %RDX,(%R9) |
(2824) 0x57f881 CMP %RCX,(%R13) |
(2824) 0x57f885 JG 57f830 |
(2823) 0x57f887 MOV %RBX,%RDX |
(2823) 0x57f88a ADD $0x8,%R13 |
(2823) 0x57f88e CMP %RBX,%R12 |
(2823) 0x57f891 JNE 57f855 |
0x57f893 ADD $0x8,%RSP |
0x57f897 POP %RBX |
0x57f898 POP %R12 |
0x57f89a POP %R13 |
0x57f89c POP %RBP |
0x57f89d RET |
0x57f89e INC %RAX |
0x57f8a1 XOR %EDX,%EDX |
0x57f8a3 JMP 57f7e4 |
0x57f8a8 NOPL (%RAX,%RAX,1) |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
○100.00 | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
Path / |
Source file and lines | IJMatrix_parcsr.c:2798-2812 |
Module | exec |
nb instructions | 39 |
nb uops | 97 |
loop length | 131 |
used x86 registers | 13 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
micro-operation queue | 24.25 cycles |
front end | 24.25 cycles |
 | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
---|---|---|---|---|---|---|---|---|
uops | 19.00 | 19.00 | 5.50 | 5.17 | 6.00 | 19.00 | 19.00 | 5.33 |
cycles | 19.00 | 19.00 | 5.50 | 5.17 | 6.00 | 19.00 | 19.00 | 5.33 |
Cycles executing div or sqrt instructions | 24.00-90.00 |
FE+BE cycles | 24.32-90.19 |
Stall cycles | 13.51-79.38 |
ROB full (events) | 14.74-80.63 |
Front-end | 24.25 |
Dispatch | 19.00 |
DIV/SQRT | 24.00-90.00 |
Overall L1 | 24.25-90.00 |
Vectorization ratios (share of vectorizable instructions that are actually vectorized):
all | 0%
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 0% |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 0% |
other | 0% |
Vector efficiency ratios (share of the available vector width actually used):
all | 12%
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 12% |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 12% |
other | 12% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
PUSH %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %RDI,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
PUSH %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
PUSH %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
SUB $0x8,%RSP | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
CALL 40f0b0 | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
MOV %EAX,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
CALL 40f1f0 | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
MOVSXD %EBX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOVSXD %EAX,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV 0x18(%R13),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
CQTO | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
IDIV %RSI | 57 | 14.25 | 14.25 | 0 | 0 | 0 | 14.25 | 14.25 | 0 | 42-95 | 24-90 |
CMP %RDX,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JL 57f89e | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
IMUL %RAX,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %RCX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
LEA (%RAX,%RDX,1),%R12 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
CMP %R12,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JGE 57f893 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOV (%R13),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
LEA 0x1(%RDX),%RBX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x20(%R13),%R10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV 0x10(%R13),%R11 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV 0x8(%R13),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
LEA (%R8,%RBX,8),%R13 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
ADD $0x8,%RSP | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
POP %RBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %R12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %RBP | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
RET | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 | 0 | 1 |
INC %RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JMP 57f7e4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |
NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
Source file and lines | IJMatrix_parcsr.c:2798-2812 |
Module | exec |
nb instructions | 39 |
nb uops | 97 |
loop length | 131 |
used x86 registers | 13 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
micro-operation queue | 24.25 cycles |
front end | 24.25 cycles |
 | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
---|---|---|---|---|---|---|---|---|
uops | 19.00 | 19.00 | 5.50 | 5.17 | 6.00 | 19.00 | 19.00 | 5.33 |
cycles | 19.00 | 19.00 | 5.50 | 5.17 | 6.00 | 19.00 | 19.00 | 5.33 |
Cycles executing div or sqrt instructions | 24.00-90.00 |
FE+BE cycles | 24.32-90.19 |
Stall cycles | 13.51-79.38 |
ROB full (events) | 14.74-80.63 |
Front-end | 24.25 |
Dispatch | 19.00 |
DIV/SQRT | 24.00-90.00 |
Overall L1 | 24.25-90.00 |
Vectorization ratios (share of vectorizable instructions that are actually vectorized):
all | 0%
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 0% |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 0% |
other | 0% |
Vector efficiency ratios (share of the available vector width actually used):
all | 12%
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 12% |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 12% |
other | 12% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
PUSH %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %RDI,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
PUSH %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
PUSH %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
SUB $0x8,%RSP | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
CALL 40f0b0 | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
MOV %EAX,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
CALL 40f1f0 | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
MOVSXD %EBX,%RSI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOVSXD %EAX,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV 0x18(%R13),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
CQTO | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
IDIV %RSI | 57 | 14.25 | 14.25 | 0 | 0 | 0 | 14.25 | 14.25 | 0 | 42-95 | 24-90 |
CMP %RDX,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JL 57f89e | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
IMUL %RAX,%RCX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %RCX,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
LEA (%RAX,%RDX,1),%R12 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
CMP %R12,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JGE 57f893 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOV (%R13),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
LEA 0x1(%RDX),%RBX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x20(%R13),%R10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV 0x10(%R13),%R11 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV 0x8(%R13),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
LEA (%R8,%RBX,8),%R13 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
ADD $0x8,%RSP | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
POP %RBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %R12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %RBP | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
RET | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 | 0 | 1 |
INC %RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JMP 57f7e4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |
NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
Name | Coverage (%) | Time (s) |
---|---|---|
▼hypre_IJMatrixAssembleParCSR._omp_fn.1– | 0.56 | 0.2 |
▼Loop 2823 - IJMatrix_parcsr.c:2802-2812 - exec– | 0 | 0 |
○Loop 2824 - IJMatrix_parcsr.c:2803-2812 - exec | 0.56 | 0.2 |