Function: hypre_CSRMatrixMatvecOutOfPlace.extracted.19 | Module: libseq_mv.so | Source: csr_matvec.c:178-206 [...] | Coverage: 0.03% |
---|
Function: hypre_CSRMatrixMatvecOutOfPlace.extracted.19 | Module: libseq_mv.so | Source: csr_matvec.c:178-206 [...] | Coverage: 0.03% |
---|
/scratch_na/users/xoserete/qaas_runs/171-415-3872/intel/AMG/build/AMG/AMG/seq_mv/csr_matvec.c: 178 - 206 |
-------------------------------------------------------------------------------- |
178: #pragma omp parallel for private(i,j,jj,m,tempx) HYPRE_SMP_SCHEDULE |
179: #endif |
180: |
181: for (i = 0; i < num_rownnz; i++) |
182: { |
183: m = A_rownnz[i]; |
[...] |
191: if ( num_vectors==1 ) |
192: { |
193: tempx = 0; |
194: for (jj = A_i[m]; jj < A_i[m+1]; jj++) |
195: tempx += A_data[jj] * x_data[A_j[jj]]; |
196: y_data[m] += tempx; |
197: } |
198: else |
199: for ( j=0; j<num_vectors; ++j ) |
200: { |
201: tempx = 0; |
202: for (jj = A_i[m]; jj < A_i[m+1]; jj++) |
203: tempx += A_data[jj] * x_data[ j*vecstride_x + A_j[jj]*idxstride_x ]; |
204: y_data[ j*vecstride_y + m*idxstride_y] += tempx; |
205: } |
206: } |
0x6600 PUSH %RBP |
0x6601 MOV %RSP,%RBP |
0x6604 PUSH %R15 |
0x6606 PUSH %R14 |
0x6608 PUSH %R13 |
0x660a PUSH %R12 |
0x660c PUSH %RBX |
0x660d SUB $0xa8,%RSP |
0x6614 MOV 0x40(%RBP),%R15 |
0x6618 MOV 0x38(%RBP),%R12 |
0x661c MOV 0x30(%RBP),%RBX |
0x6620 MOV 0x28(%RBP),%RAX |
0x6624 MOV %RAX,-0x90(%RBP) |
0x662b MOV 0x20(%RBP),%RAX |
0x662f MOV %RAX,-0x48(%RBP) |
0x6633 MOV 0x18(%RBP),%RAX |
0x6637 MOV %RAX,-0x40(%RBP) |
0x663b MOV 0x10(%RBP),%RAX |
0x663f MOV %RAX,-0x70(%RBP) |
0x6643 MOVL $0,-0x30(%RBP) |
0x664a MOV %R9,-0xa8(%RBP) |
0x6651 MOV %R8,-0x68(%RBP) |
0x6655 MOV %RCX,-0x38(%RBP) |
0x6659 MOV %RDX,-0x60(%RBP) |
0x665d MOV (%RDI),%ESI |
0x665f MOVQ $0,-0x88(%RBP) |
0x666a MOVQ $0x1,-0xc8(%RBP) |
0x6675 SUB $0x8,%RSP |
0x6679 LEA -0xc8(%RBP),%RAX |
0x6680 LEA 0x208dd9(%RIP),%RDI |
0x6687 LEA -0x30(%RBP),%RCX |
0x668b LEA -0x88(%RBP),%R8 |
0x6692 LEA 0x50(%RBP),%R9 |
0x6696 MOV %ESI,-0x2c(%RBP) |
0x6699 MOV $0x22,%EDX |
0x669e PUSH $0x1 |
0x66a0 PUSH $0x1 |
0x66a2 PUSH %RAX |
0x66a3 CALL 2240 <__kmpc_for_static_init_8@plt> |
0x66a8 ADD $0x20,%RSP |
0x66ac MOV -0x88(%RBP),%RAX |
0x66b3 MOV 0x50(%RBP),%RCX |
0x66b7 MOV %RAX,-0xa0(%RBP) |
0x66be SUB %RAX,%RCX |
0x66c1 MOV %RCX,-0x98(%RBP) |
0x66c8 JAE 66ed |
0x66ca LEA 0x208daf(%RIP),%RDI |
0x66d1 MOV -0x2c(%RBP),%ESI |
0x66d4 ADD $0xa8,%RSP |
0x66db POP %RBX |
0x66dc POP %R12 |
0x66de POP %R13 |
0x66e0 POP %R14 |
0x66e2 POP %R15 |
0x66e4 POP %RBP |
0x66e5 VZEROUPPER |
0x66e8 JMP 21d0 |
0x66ed MOV -0x48(%RBP),%RAX |
0x66f1 LEA -0x1(%RAX),%RDI |
0x66f5 VPBROADCASTQ %R12,%YMM0 |
0x66fb XOR %ECX,%ECX |
0x66fd MOV %RBX,-0x58(%RBP) |
0x6701 MOV %RDI,-0x50(%RBP) |
0x6705 JMP 673a |
0x6707 NOPW (%RAX,%RAX,1) |
(114) 0x6710 MOV -0x40(%RBP),%RAX |
(114) 0x6714 MOV (%RAX),%RAX |
(114) 0x6717 VADDSD (%RAX,%R13,8),%XMM1,%XMM1 |
(114) 0x671d VMOVSD %XMM1,(%RAX,%R13,8) |
(114) 0x6723 MOV -0xb0(%RBP),%RCX |
(114) 0x672a LEA 0x1(%RCX),%RAX |
(114) 0x672e CMP -0x98(%RBP),%RCX |
(114) 0x6735 MOV %RAX,%RCX |
(114) 0x6738 JE 66ca |
(114) 0x673a MOV -0xa0(%RBP),%RAX |
(114) 0x6741 MOV %RCX,-0xb0(%RBP) |
(114) 0x6748 ADD %RCX,%RAX |
(114) 0x674b MOV -0xa8(%RBP),%RCX |
(114) 0x6752 MOV (%RCX,%RAX,8),%RDX |
(114) 0x6756 CMPQ $0x1,-0x48(%RBP) |
(114) 0x675b JNE 6810 |
(114) 0x6761 MOV -0x38(%RBP),%RAX |
(114) 0x6765 MOV (%RAX),%RAX |
(114) 0x6768 MOV (%RAX,%RDX,8),%R10 |
(114) 0x676c MOV %RDX,%R13 |
(114) 0x676f MOV 0x8(%RAX,%RDX,8),%RAX |
(114) 0x6774 VXORPD %XMM1,%XMM1,%XMM1 |
(114) 0x6778 MOV %RAX,%R11 |
(114) 0x677b SUB %R10,%R11 |
(114) 0x677e JLE 6710 |
(114) 0x6780 MOV -0x60(%RBP),%RCX |
(114) 0x6784 MOV (%RCX),%RCX |
(114) 0x6787 MOV -0x70(%RBP),%RDX |
(114) 0x678b MOV (%RDX),%RSI |
(114) 0x678e MOV -0x68(%RBP),%RDX |
(114) 0x6792 MOV (%RDX),%R8 |
(114) 0x6795 MOV %R11,%R9 |
(114) 0x6798 AND $-0x4,%R9 |
(114) 0x679c JE 696c |
(114) 0x67a2 LEA -0x1(%R9),%RDX |
(114) 0x67a6 LEA (%RCX,%R10,8),%RDI |
(114) 0x67aa LEA (%R8,%R10,8),%RBX |
(114) 0x67ae VXORPD %XMM1,%XMM1,%XMM1 |
(114) 0x67b2 XOR %R14D,%R14D |
(114) 0x67b5 NOPW %CS:(%RAX,%RAX,1) |
(119) 0x67c0 VMOVUPD (%RBX,%R14,8),%YMM2 |
(119) 0x67c6 KXNORW %K0,%K0,%K1 |
(119) 0x67ca VXORPD %XMM3,%XMM3,%XMM3 |
(119) 0x67ce VGATHERQPD (%RSI,%YMM2,8),%YMM3{%K1} |
(119) 0x67d5 VFMADD231PD (%RDI,%R14,8),%YMM3,%YMM1 |
(119) 0x67db ADD $0x4,%R14 |
(119) 0x67df CMP %RDX,%R14 |
(119) 0x67e2 JBE 67c0 |
(114) 0x67e4 VEXTRACTF128 $0x1,%YMM1,%XMM2 |
(114) 0x67ea VADDPD %XMM2,%XMM1,%XMM1 |
(114) 0x67ee VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 |
(114) 0x67f3 VADDSD %XMM2,%XMM1,%XMM1 |
(114) 0x67f7 CMP %R9,%R11 |
(114) 0x67fa MOV -0x58(%RBP),%RBX |
(114) 0x67fe MOV -0x50(%RBP),%RDI |
(114) 0x6802 JE 6710 |
(114) 0x6808 JMP 696f |
0x680d NOPL (%RAX) |
(114) 0x6810 JL 6723 |
(114) 0x6816 MOV -0x38(%RBP),%RAX |
(114) 0x681a MOV (%RAX),%RAX |
(114) 0x681d MOV (%RAX,%RDX,8),%RCX |
(114) 0x6821 MOV 0x8(%RAX,%RDX,8),%R9 |
(114) 0x6826 MOV -0x40(%RBP),%RAX |
(114) 0x682a MOV (%RAX),%R10 |
(114) 0x682d MOV %R9,%RAX |
(114) 0x6830 SUB %RCX,%RAX |
(114) 0x6833 MOV %RAX,-0x78(%RBP) |
(114) 0x6837 IMUL -0x90(%RBP),%RDX |
(114) 0x683f MOV %RCX,-0x80(%RBP) |
(114) 0x6843 LEA (,%RCX,8),%RAX |
(114) 0x684b MOV %RAX,-0xb8(%RBP) |
(114) 0x6852 XOR %R8D,%R8D |
(114) 0x6855 MOV %RDX,-0xc0(%RBP) |
(114) 0x685c JMP 6886 |
0x685e XCHG %AX,%AX |
(115) 0x6860 MOV %R8,%RAX |
(115) 0x6863 IMUL %RBX,%RAX |
(115) 0x6867 ADD %RDX,%RAX |
(115) 0x686a VADDSD (%R10,%RAX,8),%XMM1,%XMM1 |
(115) 0x6870 VMOVSD %XMM1,(%R10,%RAX,8) |
(115) 0x6876 LEA 0x1(%R8),%RAX |
(115) 0x687a CMP %RDI,%R8 |
(115) 0x687d MOV %RAX,%R8 |
(115) 0x6880 JE 6723 |
(115) 0x6886 VXORPD %XMM1,%XMM1,%XMM1 |
(115) 0x688a CMP -0x80(%RBP),%R9 |
(115) 0x688e JLE 6860 |
(115) 0x6890 MOV -0x60(%RBP),%RAX |
(115) 0x6894 MOV (%RAX),%R14 |
(115) 0x6897 MOV -0x70(%RBP),%RAX |
(115) 0x689b MOV (%RAX),%R13 |
(115) 0x689e MOV -0x68(%RBP),%RAX |
(115) 0x68a2 MOV (%RAX),%RSI |
(115) 0x68a5 MOV -0x78(%RBP),%R11 |
(115) 0x68a9 AND $-0x4,%R11 |
(115) 0x68ad JE 6939 |
(115) 0x68b3 LEA -0x1(%R11),%RDX |
(115) 0x68b7 MOV -0xb8(%RBP),%RCX |
(115) 0x68be LEA (%R14,%RCX,1),%RAX |
(115) 0x68c2 LEA (%RSI,%RCX,1),%RDI |
(115) 0x68c6 VXORPD %XMM1,%XMM1,%XMM1 |
(115) 0x68ca XOR %EBX,%EBX |
(115) 0x68cc NOPL (%RAX) |
(117) 0x68d0 VXORPS %XMM2,%XMM2,%XMM2 |
(117) 0x68d4 VPMULLQ (%RDI,%RBX,8),%YMM0,%YMM2 |
(117) 0x68db MOV %R8,%RCX |
(117) 0x68de IMUL %R15,%RCX |
(117) 0x68e2 VPBROADCASTQ %RCX,%YMM3 |
(117) 0x68e8 VPADDQ %YMM3,%YMM2,%YMM2 |
(117) 0x68ec KXNORW %K0,%K0,%K1 |
(117) 0x68f0 VPXOR %XMM3,%XMM3,%XMM3 |
(117) 0x68f4 VGATHERQPD (%R13,%YMM2,8),%YMM3{%K1} |
(117) 0x68fc VFMADD231PD (%RAX,%RBX,8),%YMM3,%YMM1 |
(117) 0x6902 ADD $0x4,%RBX |
(117) 0x6906 CMP %RDX,%RBX |
(117) 0x6909 JBE 68d0 |
(115) 0x690b VEXTRACTF128 $0x1,%YMM1,%XMM2 |
(115) 0x6911 VADDPD %XMM2,%XMM1,%XMM1 |
(115) 0x6915 VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 |
(115) 0x691a VADDSD %XMM2,%XMM1,%XMM1 |
(115) 0x691e CMP %R11,-0x78(%RBP) |
(115) 0x6922 MOV -0x58(%RBP),%RBX |
(115) 0x6926 MOV -0x50(%RBP),%RDI |
(115) 0x692a MOV -0xc0(%RBP),%RDX |
(115) 0x6931 JE 6860 |
(115) 0x6937 JMP 693c |
(115) 0x6939 XOR %R11D,%R11D |
(115) 0x693c ADD -0x80(%RBP),%R11 |
(116) 0x6940 MOV (%RSI,%R11,8),%RAX |
(116) 0x6944 IMUL %R12,%RAX |
(116) 0x6948 MOV %R8,%RCX |
(116) 0x694b IMUL %R15,%RCX |
(116) 0x694f ADD %RAX,%RCX |
(116) 0x6952 VMOVSD (%R13,%RCX,8),%XMM2 |
(116) 0x6959 VFMADD231SD (%R14,%R11,8),%XMM2,%XMM1 |
(116) 0x695f INC %R11 |
(116) 0x6962 CMP %R11,%R9 |
(116) 0x6965 JNE 6940 |
(115) 0x6967 JMP 6860 |
(114) 0x696c XOR %R9D,%R9D |
(114) 0x696f ADD %R10,%R9 |
(114) 0x6972 NOPW %CS:(%RAX,%RAX,1) |
(118) 0x6980 MOV (%R8,%R9,8),%RDX |
(118) 0x6984 VMOVSD (%RSI,%RDX,8),%XMM2 |
(118) 0x6989 VFMADD231SD (%RCX,%R9,8),%XMM2,%XMM1 |
(118) 0x698f INC %R9 |
(118) 0x6992 CMP %R9,%RAX |
(118) 0x6995 JNE 6980 |
(114) 0x6997 JMP 6710 |
0x699c NOPL (%RAX) |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►100.00+ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_invoke_task_func | libiomp5.so |
Path / |
Source file and lines | csr_matvec.c:178-206 |
Module | libseq_mv.so |
nb instructions | 68 |
nb uops | 70 |
loop length | 281 |
used x86 registers | 14 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 1 |
used zmm registers | 0 |
nb stack references | 24 |
micro-operation queue | 11.67 cycles |
front end | 11.67 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.30 | 1.20 | 6.00 | 6.00 | 13.00 | 1.20 | 1.10 | 13.00 | 13.00 | 13.00 | 1.20 | 6.00 |
cycles | 1.30 | 1.20 | 6.00 | 6.00 | 13.00 | 1.20 | 1.10 | 13.00 | 13.00 | 13.00 | 1.20 | 6.00 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 13.09 |
Stall cycles | 1.28-1.29 |
RS full (events) | 3.60-3.97 |
Front-end | 11.67 |
Dispatch | 13.00 |
Overall L1 | 13.00 |
all | 3% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 25% |
all | 11% |
load | 10% |
store | 10% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R15 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
SUB $0xa8,%RSP | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV 0x40(%RBP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x38(%RBP),%R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x30(%RBP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x28(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x90(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x20(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x18(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x40(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x10(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x70(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOVL $0,-0x30(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R9,-0xa8(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R8,-0x68(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RCX,-0x38(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RDX,-0x60(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV (%RDI),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVQ $0,-0x88(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOVQ $0x1,-0xc8(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB $0x8,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA -0xc8(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA 0x208dd9(%RIP),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x30(%RBP),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x88(%RBP),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA 0x50(%RBP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %ESI,-0x2c(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV $0x22,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RAX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
CALL 2240 <__kmpc_for_static_init_8@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
ADD $0x20,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV -0x88(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x50(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0xa0(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB %RAX,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %RCX,-0x98(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JAE 66ed <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0xed> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
LEA 0x208daf(%RIP),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV -0x2c(%RBP),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD $0xa8,%RSP | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
VZEROUPPER | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
JMP 21d0 <__kmpc_for_static_fini@plt> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
MOV -0x48(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA -0x1(%RAX),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VPBROADCASTQ %R12,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %RBX,-0x58(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RDI,-0x50(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JMP 673a <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x13a> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
XCHG %AX,%AX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Source file and lines | csr_matvec.c:178-206 |
Module | libseq_mv.so |
nb instructions | 68 |
nb uops | 70 |
loop length | 281 |
used x86 registers | 14 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 1 |
used zmm registers | 0 |
nb stack references | 24 |
micro-operation queue | 11.67 cycles |
front end | 11.67 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.30 | 1.20 | 6.00 | 6.00 | 13.00 | 1.20 | 1.10 | 13.00 | 13.00 | 13.00 | 1.20 | 6.00 |
cycles | 1.30 | 1.20 | 6.00 | 6.00 | 13.00 | 1.20 | 1.10 | 13.00 | 13.00 | 13.00 | 1.20 | 6.00 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 13.09 |
Stall cycles | 1.28-1.29 |
RS full (events) | 3.60-3.97 |
Front-end | 11.67 |
Dispatch | 13.00 |
Overall L1 | 13.00 |
all | 3% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 25% |
all | 11% |
load | 10% |
store | 10% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R15 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
SUB $0xa8,%RSP | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV 0x40(%RBP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x38(%RBP),%R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x30(%RBP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x28(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x90(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x20(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x18(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x40(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x10(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x70(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOVL $0,-0x30(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R9,-0xa8(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R8,-0x68(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RCX,-0x38(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RDX,-0x60(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV (%RDI),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVQ $0,-0x88(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOVQ $0x1,-0xc8(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB $0x8,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA -0xc8(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA 0x208dd9(%RIP),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x30(%RBP),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x88(%RBP),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA 0x50(%RBP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %ESI,-0x2c(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV $0x22,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RAX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
CALL 2240 <__kmpc_for_static_init_8@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
ADD $0x20,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV -0x88(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x50(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0xa0(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB %RAX,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %RCX,-0x98(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JAE 66ed <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0xed> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
LEA 0x208daf(%RIP),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV -0x2c(%RBP),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD $0xa8,%RSP | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
VZEROUPPER | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
JMP 21d0 <__kmpc_for_static_fini@plt> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
MOV -0x48(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA -0x1(%RAX),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VPBROADCASTQ %R12,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %RBX,-0x58(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RDI,-0x50(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JMP 673a <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x13a> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
XCHG %AX,%AX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Name | Coverage (%) | Time (s) |
---|---|---|
▼hypre_CSRMatrixMatvecOutOfPlace.extracted.19– | 0.03 | 0.01 |
▼Loop 114 - csr_matvec.c:178-206 - libseq_mv.so– | 0.01 | 0 |
○Loop 119 - csr_matvec.c:194-195 - libseq_mv.so | 0.02 | 0 |
○Loop 118 - csr_matvec.c:194-195 - libseq_mv.so | 0 | 0 |
▼Loop 115 - csr_matvec.c:178-204 - libseq_mv.so– | 0 | 0 |
○Loop 117 - csr_matvec.c:202-203 - libseq_mv.so | 0 | 0 |
○Loop 116 - csr_matvec.c:202-203 - libseq_mv.so | 0 | 0 |