Function: hypre_CSRMatrixMatvecOutOfPlace.extracted.19 | Module: exec | Source: csr_matvec.c:178-206 [...] | Coverage: 0.03% |
---|
Function: hypre_CSRMatrixMatvecOutOfPlace.extracted.19 | Module: exec | Source: csr_matvec.c:178-206 [...] | Coverage: 0.03% |
---|
/scratch_na/users/xoserete/qaas_runs/171-415-3872/intel/AMG/build/AMG/AMG/seq_mv/csr_matvec.c: 178 - 206 |
-------------------------------------------------------------------------------- |
178: #pragma omp parallel for private(i,j,jj,m,tempx) HYPRE_SMP_SCHEDULE |
179: #endif |
180: |
181: for (i = 0; i < num_rownnz; i++) |
182: { |
183: m = A_rownnz[i]; |
[...] |
191: if ( num_vectors==1 ) |
192: { |
193: tempx = 0; |
194: for (jj = A_i[m]; jj < A_i[m+1]; jj++) |
195: tempx += A_data[jj] * x_data[A_j[jj]]; |
196: y_data[m] += tempx; |
197: } |
198: else |
199: for ( j=0; j<num_vectors; ++j ) |
200: { |
201: tempx = 0; |
202: for (jj = A_i[m]; jj < A_i[m+1]; jj++) |
203: tempx += A_data[jj] * x_data[ j*vecstride_x + A_j[jj]*idxstride_x ]; |
204: y_data[ j*vecstride_y + m*idxstride_y] += tempx; |
205: } |
206: } |
0x4de910 PUSH %RBP |
0x4de911 MOV %RSP,%RBP |
0x4de914 PUSH %R15 |
0x4de916 PUSH %R14 |
0x4de918 PUSH %R13 |
0x4de91a PUSH %R12 |
0x4de91c PUSH %RBX |
0x4de91d SUB $0x78,%RSP |
0x4de921 MOV 0x40(%RBP),%RAX |
0x4de925 MOV %RAX,-0x78(%RBP) |
0x4de929 MOV 0x38(%RBP),%R12 |
0x4de92d MOV 0x30(%RBP),%RAX |
0x4de931 MOV %RAX,-0x98(%RBP) |
0x4de938 MOV 0x28(%RBP),%RAX |
0x4de93c MOV %RAX,-0x88(%RBP) |
0x4de943 MOV 0x20(%RBP),%RAX |
0x4de947 MOV %RAX,-0x30(%RBP) |
0x4de94b MOV 0x18(%RBP),%RAX |
0x4de94f MOV %RAX,-0x70(%RBP) |
0x4de953 MOV 0x10(%RBP),%RBX |
0x4de957 MOVL $0,-0x38(%RBP) |
0x4de95e MOV %R9,-0x58(%RBP) |
0x4de962 MOV %R8,%R14 |
0x4de965 MOV %RCX,-0x50(%RBP) |
0x4de969 MOV %RDX,%R13 |
0x4de96c MOV (%RDI),%ESI |
0x4de96e MOVQ $0,-0x80(%RBP) |
0x4de976 MOVQ $0x1,-0xa0(%RBP) |
0x4de981 SUB $0x8,%RSP |
0x4de985 LEA -0xa0(%RBP),%RAX |
0x4de98c LEA -0x38(%RBP),%RCX |
0x4de990 LEA -0x80(%RBP),%R8 |
0x4de994 LEA 0x50(%RBP),%R9 |
0x4de998 MOV $0x736d70,%EDI |
0x4de99d MOV %ESI,-0x34(%RBP) |
0x4de9a0 MOV $0x22,%EDX |
0x4de9a5 PUSH $0x1 |
0x4de9a7 PUSH $0x1 |
0x4de9a9 PUSH %RAX |
0x4de9aa CALL 40fef0 <__kmpc_for_static_init_8@plt> |
0x4de9af ADD $0x20,%RSP |
0x4de9b3 MOV -0x80(%RBP),%RAX |
0x4de9b7 MOV 0x50(%RBP),%RCX |
0x4de9bb MOV %RAX,-0x48(%RBP) |
0x4de9bf SUB %RAX,%RCX |
0x4de9c2 MOV %RCX,-0x40(%RBP) |
0x4de9c6 JAE 4de9e6 |
0x4de9c8 MOV $0x736d90,%EDI |
0x4de9cd MOV -0x34(%RBP),%ESI |
0x4de9d0 ADD $0x78,%RSP |
0x4de9d4 POP %RBX |
0x4de9d5 POP %R12 |
0x4de9d7 POP %R13 |
0x4de9d9 POP %R14 |
0x4de9db POP %R15 |
0x4de9dd POP %RBP |
0x4de9de VZEROUPPER |
0x4de9e1 JMP 40fc30 |
0x4de9e6 CMPQ $0x1,-0x30(%RBP) |
0x4de9eb JNE 4deacd |
0x4de9f1 XOR %EDX,%EDX |
0x4de9f3 JMP 4dea1b |
0x4de9f5 NOPW %CS:(%RAX,%RAX,1) |
(4332) 0x4dea00 MOV -0x70(%RBP),%RCX |
(4332) 0x4dea04 VADDSD (%RCX,%RAX,8),%XMM0,%XMM0 |
(4332) 0x4dea09 VMOVSD %XMM0,(%RCX,%RAX,8) |
(4332) 0x4dea0e LEA 0x1(%RDX),%RAX |
(4332) 0x4dea12 CMP -0x40(%RBP),%RDX |
(4332) 0x4dea16 MOV %RAX,%RDX |
(4332) 0x4dea19 JE 4de9c8 |
(4332) 0x4dea1b MOV -0x48(%RBP),%RAX |
(4332) 0x4dea1f ADD %RDX,%RAX |
(4332) 0x4dea22 MOV -0x58(%RBP),%RCX |
(4332) 0x4dea26 MOV (%RCX,%RAX,8),%RAX |
(4332) 0x4dea2a MOV -0x50(%RBP),%RCX |
(4332) 0x4dea2e MOV (%RCX,%RAX,8),%RDI |
(4332) 0x4dea32 MOV 0x8(%RCX,%RAX,8),%RCX |
(4332) 0x4dea37 VXORPD %XMM0,%XMM0,%XMM0 |
(4332) 0x4dea3b MOV %RCX,%R8 |
(4332) 0x4dea3e SUB %RDI,%R8 |
(4332) 0x4dea41 JLE 4dea00 |
(4332) 0x4dea43 MOV %R8,%RSI |
(4332) 0x4dea46 AND $-0x4,%RSI |
(4332) 0x4dea4a JE 4deaa2 |
(4332) 0x4dea4c LEA -0x1(%RSI),%R9 |
(4332) 0x4dea50 LEA (%R13,%RDI,8),%R10 |
(4332) 0x4dea55 LEA (%R14,%RDI,8),%R11 |
(4332) 0x4dea59 VXORPD %XMM0,%XMM0,%XMM0 |
(4332) 0x4dea5d XOR %R12D,%R12D |
(4334) 0x4dea60 VMOVUPD (%R11,%R12,8),%YMM1 |
(4334) 0x4dea66 KXNORW %K0,%K0,%K1 |
(4334) 0x4dea6a VXORPD %XMM2,%XMM2,%XMM2 |
(4334) 0x4dea6e VGATHERQPD (%RBX,%YMM1,8),%YMM2{%K1} |
(4334) 0x4dea75 VFMADD231PD (%R10,%R12,8),%YMM2,%YMM0 |
(4334) 0x4dea7b ADD $0x4,%R12 |
(4334) 0x4dea7f CMP %R9,%R12 |
(4334) 0x4dea82 JBE 4dea60 |
(4332) 0x4dea84 VEXTRACTF128 $0x1,%YMM0,%XMM1 |
(4332) 0x4dea8a VADDPD %XMM1,%XMM0,%XMM0 |
(4332) 0x4dea8e VSHUFPD $0x1,%XMM0,%XMM0,%XMM1 |
(4332) 0x4dea93 VADDSD %XMM1,%XMM0,%XMM0 |
(4332) 0x4dea97 CMP %RSI,%R8 |
(4332) 0x4dea9a JE 4dea00 |
(4332) 0x4deaa0 JMP 4deaa4 |
(4332) 0x4deaa2 XOR %ESI,%ESI |
(4332) 0x4deaa4 ADD %RDI,%RSI |
(4332) 0x4deaa7 NOPW (%RAX,%RAX,1) |
(4333) 0x4deab0 MOV (%R14,%RSI,8),%RDI |
(4333) 0x4deab4 VMOVSD (%RBX,%RDI,8),%XMM1 |
(4333) 0x4deab9 VFMADD231SD (%R13,%RSI,8),%XMM1,%XMM0 |
(4333) 0x4deac0 INC %RSI |
(4333) 0x4deac3 CMP %RSI,%RCX |
(4333) 0x4deac6 JNE 4deab0 |
(4332) 0x4deac8 JMP 4dea00 |
0x4deacd JL 4de9c8 |
0x4dead3 DECQ -0x30(%RBP) |
0x4dead7 VPBROADCASTQ %R12,%YMM0 |
0x4deadd XOR %ECX,%ECX |
0x4deadf JMP 4deb08 |
0x4deae1 NOPW %CS:(%RAX,%RAX,1) |
(4328) 0x4deaf0 MOV -0x90(%RBP),%RCX |
(4328) 0x4deaf7 LEA 0x1(%RCX),%RAX |
(4328) 0x4deafb CMP -0x40(%RBP),%RCX |
(4328) 0x4deaff MOV %RAX,%RCX |
(4328) 0x4deb02 JE 4de9c8 |
(4328) 0x4deb08 MOV -0x48(%RBP),%RAX |
(4328) 0x4deb0c MOV %RCX,-0x90(%RBP) |
(4328) 0x4deb13 ADD %RCX,%RAX |
(4328) 0x4deb16 MOV -0x58(%RBP),%RCX |
(4328) 0x4deb1a MOV (%RCX,%RAX,8),%RSI |
(4328) 0x4deb1e MOV -0x50(%RBP),%RAX |
(4328) 0x4deb22 MOV (%RAX,%RSI,8),%RCX |
(4328) 0x4deb26 MOV 0x8(%RAX,%RSI,8),%R8 |
(4328) 0x4deb2b MOV %R8,%RAX |
(4328) 0x4deb2e MOV %RCX,-0x68(%RBP) |
(4328) 0x4deb32 SUB %RCX,%RAX |
(4328) 0x4deb35 MOV %RAX,-0x60(%RBP) |
(4328) 0x4deb39 JLE 4deaf0 |
(4328) 0x4deb3b IMUL -0x88(%RBP),%RSI |
(4328) 0x4deb43 MOV -0x60(%RBP),%R9 |
(4328) 0x4deb47 AND $-0x4,%R9 |
(4328) 0x4deb4b LEA -0x1(%R9),%R11 |
(4328) 0x4deb4f MOV -0x68(%RBP),%RCX |
(4328) 0x4deb53 LEA (%R13,%RCX,8),%RAX |
(4328) 0x4deb58 LEA (%R14,%RCX,8),%RCX |
(4328) 0x4deb5c XOR %EDI,%EDI |
(4328) 0x4deb5e JMP 4deb8f |
(4330) 0x4deb60 MOV %RDI,%RDX |
(4330) 0x4deb63 IMUL -0x98(%RBP),%RDX |
(4330) 0x4deb6b ADD %RSI,%RDX |
(4330) 0x4deb6e MOV -0x70(%RBP),%R10 |
(4330) 0x4deb72 VADDSD (%R10,%RDX,8),%XMM1,%XMM1 |
(4330) 0x4deb78 VMOVSD %XMM1,(%R10,%RDX,8) |
(4330) 0x4deb7e LEA 0x1(%RDI),%RDX |
(4330) 0x4deb82 CMP -0x30(%RBP),%RDI |
(4330) 0x4deb86 MOV %RDX,%RDI |
(4330) 0x4deb89 JE 4deaf0 |
(4330) 0x4deb8f TEST %R9,%R9 |
(4330) 0x4deb92 JE 4dec00 |
(4330) 0x4deb94 MOV %RDI,%RDX |
(4330) 0x4deb97 IMUL -0x78(%RBP),%RDX |
(4330) 0x4deb9c VPBROADCASTQ %RDX,%YMM2 |
(4330) 0x4deba2 VXORPD %XMM1,%XMM1,%XMM1 |
(4330) 0x4deba6 XOR %EDX,%EDX |
(4330) 0x4deba8 NOPL (%RAX,%RAX,1) |
(4331) 0x4debb0 VXORPS %XMM3,%XMM3,%XMM3 |
(4331) 0x4debb4 VPMULLQ (%RCX,%RDX,8),%YMM0,%YMM3 |
(4331) 0x4debbb VPADDQ %YMM2,%YMM3,%YMM3 |
(4331) 0x4debbf KXNORW %K0,%K0,%K1 |
(4331) 0x4debc3 VXORPD %XMM4,%XMM4,%XMM4 |
(4331) 0x4debc7 VGATHERQPD (%RBX,%YMM3,8),%YMM4{%K1} |
(4331) 0x4debce VFMADD231PD (%RAX,%RDX,8),%YMM4,%YMM1 |
(4331) 0x4debd4 ADD $0x4,%RDX |
(4331) 0x4debd8 CMP %R11,%RDX |
(4331) 0x4debdb JBE 4debb0 |
(4330) 0x4debdd VEXTRACTF128 $0x1,%YMM1,%XMM2 |
(4330) 0x4debe3 VADDPD %XMM2,%XMM1,%XMM1 |
(4330) 0x4debe7 VSHUFPD $0x1,%XMM1,%XMM1,%XMM2 |
(4330) 0x4debec VADDSD %XMM2,%XMM1,%XMM1 |
(4330) 0x4debf0 MOV %R9,%R10 |
(4330) 0x4debf3 CMP %R9,-0x60(%RBP) |
(4330) 0x4debf7 JE 4deb60 |
(4330) 0x4debfd JMP 4dec07 |
0x4debff NOP |
(4330) 0x4dec00 VXORPD %XMM1,%XMM1,%XMM1 |
(4330) 0x4dec04 XOR %R10D,%R10D |
(4330) 0x4dec07 MOV %RDI,%RDX |
(4330) 0x4dec0a IMUL -0x78(%RBP),%RDX |
(4330) 0x4dec0f ADD -0x68(%RBP),%R10 |
(4330) 0x4dec13 NOPW %CS:(%RAX,%RAX,1) |
(4329) 0x4dec20 MOV (%R14,%R10,8),%R15 |
(4329) 0x4dec24 IMUL %R12,%R15 |
(4329) 0x4dec28 ADD %RDX,%R15 |
(4329) 0x4dec2b VMOVSD (%RBX,%R15,8),%XMM2 |
(4329) 0x4dec31 VFMADD231SD (%R13,%R10,8),%XMM2,%XMM1 |
(4329) 0x4dec38 INC %R10 |
(4329) 0x4dec3b CMP %R10,%R8 |
(4329) 0x4dec3e JNE 4dec20 |
(4330) 0x4dec40 JMP 4deb60 |
0x4dec45 NOPW %CS:(%RAX,%RAX,1) |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►100.00+ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_invoke_task_func | libiomp5.so |
Path / |
Source file and lines | csr_matvec.c:178-206 |
Module | exec |
nb instructions | 71 |
nb uops | 75 |
loop length | 287 |
used x86 registers | 14 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 1 |
used zmm registers | 0 |
nb stack references | 21 |
micro-operation queue | 12.50 cycles |
front end | 12.50 cycles |
Metric | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 2.10 | 2.00 | 6.33 | 6.33 | 12.00 | 2.00 | 1.90 | 12.00 | 12.00 | 12.00 | 2.00 | 6.33 |
cycles | 2.10 | 2.00 | 6.33 | 6.33 | 12.00 | 2.00 | 1.90 | 12.00 | 12.00 | 12.00 | 2.00 | 6.33 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 12.60 |
Stall cycles | 0.00 |
Front-end | 12.50 |
Dispatch | 12.00 |
Overall L1 | 12.50 |
Vectorization ratios (label inferred from the standard CQA report layout) |
all | 3% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Vector efficiency ratios (label inferred from the standard CQA report layout) |
all | 11% |
load | 10% |
store | 10% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 11% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R15 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
SUB $0x78,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0x40(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x78(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x38(%RBP),%R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x30(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x98(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x28(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x88(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x20(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x30(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x18(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x70(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x10(%RBP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVL $0,-0x38(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R9,-0x58(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R8,%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %RCX,-0x50(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RDX,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV (%RDI),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVQ $0,-0x80(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOVQ $0x1,-0xa0(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB $0x8,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA -0xa0(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x38(%RBP),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x80(%RBP),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA 0x50(%RBP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV $0x736d70,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %ESI,-0x34(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV $0x22,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RAX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
CALL 40fef0 <__kmpc_for_static_init_8@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
ADD $0x20,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV -0x80(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x50(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB %RAX,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %RCX,-0x40(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JAE 4de9e6 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0xd6> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV $0x736d90,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV -0x34(%RBP),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD $0x78,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
VZEROUPPER | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
JMP 40fc30 <__kmpc_for_static_fini@plt> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
CMPQ $0x1,-0x30(%RBP) | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
JNE 4deacd <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x1bd> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4dea1b <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x10b> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JL 4de9c8 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0xb8> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
DECQ -0x30(%RBP) | 3 | 0.20 | 0.20 | 0.33 | 0.33 | 0.50 | 0.20 | 0.20 | 0.50 | 0.50 | 0.50 | 0.20 | 0.33 | 1 | 0.50 |
VPBROADCASTQ %R12,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4deb08 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x1f8> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Source file and lines | csr_matvec.c:178-206 |
Module | exec |
nb instructions | 71 |
nb uops | 75 |
loop length | 287 |
used x86 registers | 14 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 1 |
used zmm registers | 0 |
nb stack references | 21 |
micro-operation queue | 12.50 cycles |
front end | 12.50 cycles |
Metric | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 2.10 | 2.00 | 6.33 | 6.33 | 12.00 | 2.00 | 1.90 | 12.00 | 12.00 | 12.00 | 2.00 | 6.33 |
cycles | 2.10 | 2.00 | 6.33 | 6.33 | 12.00 | 2.00 | 1.90 | 12.00 | 12.00 | 12.00 | 2.00 | 6.33 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 12.60 |
Stall cycles | 0.00 |
Front-end | 12.50 |
Dispatch | 12.00 |
Overall L1 | 12.50 |
Vectorization ratios (label inferred from the standard CQA report layout) |
all | 3% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Vector efficiency ratios (label inferred from the standard CQA report layout) |
all | 11% |
load | 10% |
store | 10% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 11% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R15 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
SUB $0x78,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0x40(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x78(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x38(%RBP),%R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x30(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x98(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x28(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x88(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x20(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x30(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x18(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x70(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x10(%RBP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVL $0,-0x38(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R9,-0x58(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R8,%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %RCX,-0x50(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RDX,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV (%RDI),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVQ $0,-0x80(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOVQ $0x1,-0xa0(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB $0x8,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA -0xa0(%RBP),%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x38(%RBP),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA -0x80(%RBP),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA 0x50(%RBP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV $0x736d70,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %ESI,-0x34(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV $0x22,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH $0x1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RAX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
CALL 40fef0 <__kmpc_for_static_init_8@plt> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
ADD $0x20,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV -0x80(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x50(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RAX,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
SUB %RAX,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %RCX,-0x40(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JAE 4de9e6 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0xd6> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV $0x736d90,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV -0x34(%RBP),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD $0x78,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
VZEROUPPER | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
JMP 40fc30 <__kmpc_for_static_fini@plt> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
CMPQ $0x1,-0x30(%RBP) | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
JNE 4deacd <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x1bd> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4dea1b <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x10b> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JL 4de9c8 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0xb8> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
DECQ -0x30(%RBP) | 3 | 0.20 | 0.20 | 0.33 | 0.33 | 0.50 | 0.20 | 0.20 | 0.50 | 0.50 | 0.50 | 0.20 | 0.33 | 1 | 0.50 |
VPBROADCASTQ %R12,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
XOR %ECX,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 4deb08 <hypre_CSRMatrixMatvecOutOfPlace.extracted.19+0x1f8> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Name | Coverage (%) | Time (s) |
---|---|---|
▼hypre_CSRMatrixMatvecOutOfPlace.extracted.19– | 0.03 | 0.01 |
▼Loop 4332 - csr_matvec.c:181-206 - exec– | 0.01 | 0 |
○Loop 4334 - csr_matvec.c:194-195 - exec | 0.02 | 0.01 |
○Loop 4333 - csr_matvec.c:194-195 - exec | 0 | 0 |
▼Loop 4328 - csr_matvec.c:181-206 - exec– | 0 | 0 |
▼Loop 4330 - csr_matvec.c:199-204 - exec– | 0 | 0 |
○Loop 4329 - csr_matvec.c:202-203 - exec | 0 | 0 |
○Loop 4331 - csr_matvec.c:202-203 - exec | 0 | 0 |