Loop Id: 560 | Module: libqmcwfs.so | Source: MultiBsplineRef.hpp:227-262 [...] | Coverage: 0.01% |
---|
Loop Id: 560 | Module: libqmcwfs.so | Source: MultiBsplineRef.hpp:227-262 [...] | Coverage: 0.01% |
---|
0x4bf4b MOV -0x178(%RBP),%RAX |
0x4bf52 MOV %RDI,%RCX |
0x4bf55 VMOVSD -0x180(%RBP),%XMM11 |
0x4bf5d SUB %RAX,%RCX |
0x4bf60 MOV -0x170(%RBP),%RAX |
0x4bf67 MOV %RCX,-0x160(%RBP) |
0x4bf6e MOV -0x198(%RBP),%RCX |
0x4bf75 VMOVSD (%RCX,%RAX,1),%XMM8 |
0x4bf7a MOV -0x1a8(%RBP),%RCX |
0x4bf81 VMOVSD (%RCX,%RAX,1),%XMM13 |
0x4bf86 MOV -0x1a0(%RBP),%RCX |
0x4bf8d VMULSD %XMM8,%XMM10,%XMM9 |
0x4bf92 VMULSD %XMM11,%XMM8,%XMM14 |
0x4bf97 VMULSD %XMM6,%XMM8,%XMM12 |
0x4bf9b VMULSD (%RCX,%RAX,1),%XMM6,%XMM8 |
0x4bfa0 MOV -0x190(%RBP),%ECX |
0x4bfa6 XOR %EAX,%EAX |
0x4bfa8 VMULSD %XMM11,%XMM13,%XMM15 |
0x4bfad VMULSD %XMM13,%XMM6,%XMM13 |
0x4bfb2 TEST %ECX,%ECX |
0x4bfb4 JLE 4c0d8 |
0x4bfba NOPW (%RAX,%RAX,1) |
(563) 0x4bfc0 VMOVSD (%RDX,%RAX,8),%XMM1 |
(563) 0x4bfc5 MOV -0x158(%RBP),%RCX |
(563) 0x4bfcc VMOVHPD (%RCX,%RAX,8),%XMM1,%XMM2 |
(563) 0x4bfd1 MOV -0x160(%RBP),%RCX |
(563) 0x4bfd8 VMOVSD (%RCX,%RAX,8),%XMM0 |
(563) 0x4bfdd MOV -0x168(%RBP),%RCX |
(563) 0x4bfe4 VMOVHPD (%RDI,%RAX,8),%XMM0,%XMM11 |
(563) 0x4bfe9 VINSERTF128 $0x1,%XMM2,%YMM11,%YMM1 |
(563) 0x4bfef VMULPD %YMM3,%YMM1,%YMM11 |
(563) 0x4bff3 VMULPD %YMM4,%YMM1,%YMM2 |
(563) 0x4bff7 VMULPD %YMM5,%YMM1,%YMM1 |
(563) 0x4bffb VEXTRACTF128 $0x1,%YMM1,%XMM0 |
(563) 0x4c001 VADDPD %XMM1,%XMM0,%XMM1 |
(563) 0x4c005 VUNPCKHPD %XMM1,%XMM1,%XMM0 |
(563) 0x4c009 VADDPD %XMM1,%XMM0,%XMM1 |
(563) 0x4c00d VEXTRACTF128 $0x1,%YMM2,%XMM0 |
(563) 0x4c013 VADDPD %XMM2,%XMM0,%XMM0 |
(563) 0x4c017 VUNPCKHPD %XMM0,%XMM0,%XMM2 |
(563) 0x4c01b VADDPD %XMM0,%XMM2,%XMM0 |
(563) 0x4c01f VEXTRACTF128 $0x1,%YMM11,%XMM2 |
(563) 0x4c025 VADDPD %XMM11,%XMM2,%XMM2 |
(563) 0x4c02a VUNPCKHPD %XMM2,%XMM2,%XMM11 |
(563) 0x4c02e VADDPD %XMM2,%XMM11,%XMM2 |
(563) 0x4c032 VMOVSD %XMM9,%XMM9,%XMM11 |
(563) 0x4c037 VFMADD213SD (%R9,%RAX,8),%XMM1,%XMM11 |
(563) 0x4c03d VMOVSD %XMM11,(%R9,%RAX,8) |
(563) 0x4c043 VMOVSD %XMM15,%XMM15,%XMM11 |
(563) 0x4c048 VFMADD213SD (%R14,%RAX,8),%XMM1,%XMM11 |
(563) 0x4c04e VMOVSD %XMM11,(%R14,%RAX,8) |
(563) 0x4c054 VMOVSD %XMM14,%XMM14,%XMM11 |
(563) 0x4c059 VFMADD213SD (%R15,%RAX,8),%XMM0,%XMM11 |
(563) 0x4c05f VMOVSD %XMM11,(%R15,%RAX,8) |
(563) 0x4c065 VMOVSD %XMM8,%XMM8,%XMM11 |
(563) 0x4c06a VFMADD213SD (%R13,%RAX,8),%XMM1,%XMM11 |
(563) 0x4c071 VMOVSD %XMM11,(%R13,%RAX,8) |
(563) 0x4c078 VMOVSD %XMM13,%XMM13,%XMM11 |
(563) 0x4c07d VFMADD213SD (%R10,%RAX,8),%XMM0,%XMM11 |
(563) 0x4c083 VMOVSD %XMM11,(%R10,%RAX,8) |
(563) 0x4c089 VFMADD213SD (%R8,%RAX,8),%XMM12,%XMM2 |
(563) 0x4c08f VMOVSD %XMM13,%XMM13,%XMM11 |
(563) 0x4c094 VMOVSD %XMM2,(%R8,%RAX,8) |
(563) 0x4c09a VMOVSD %XMM14,%XMM14,%XMM2 |
(563) 0x4c09e VFMADD213SD (%RBX,%RAX,8),%XMM1,%XMM2 |
(563) 0x4c0a4 VMOVSD %XMM2,(%RBX,%RAX,8) |
(563) 0x4c0a9 VFMADD213SD (%R12,%RAX,8),%XMM1,%XMM11 |
(563) 0x4c0af VFMADD213SD (%RSI,%RAX,8),%XMM12,%XMM1 |
(563) 0x4c0b5 VMOVSD %XMM11,(%R12,%RAX,8) |
(563) 0x4c0bb VFMADD213SD (%R11,%RAX,8),%XMM12,%XMM0 |
(563) 0x4c0c1 VMOVSD %XMM1,(%RSI,%RAX,8) |
(563) 0x4c0c6 VMOVSD %XMM0,(%R11,%RAX,8) |
(563) 0x4c0cc INC %RAX |
(563) 0x4c0cf CMP %RCX,%RAX |
(563) 0x4c0d2 JNE 4bfc0 |
0x4c0d8 MOV -0x188(%RBP),%RAX |
0x4c0df ADDQ $0x8,-0x170(%RBP) |
0x4c0e7 MOV -0x170(%RBP),%RCX |
0x4c0ee ADD %RAX,-0x158(%RBP) |
0x4c0f5 ADD %RAX,%RDI |
0x4c0f8 ADD %RAX,%RDX |
0x4c0fb CMP $0x20,%RCX |
0x4c0ff JNE 4bf4b |
/home/eoseret/qaas_runs_CPU_9468/171-145-9236/intel/miniqmc/build/miniqmc/src/Numerics/Spline2/MultiBsplineRef.hpp: 227 - 262 |
-------------------------------------------------------------------------------- |
227: for (int j = 0; j < 4; j++) |
228: { |
229: const T* restrict coefs = spline_m->coefs + (ix + i) * xs + (iy + j) * ys + iz * zs; |
230: const T* restrict coefszs = coefs + zs; |
231: const T* restrict coefs2zs = coefs + 2 * zs; |
232: const T* restrict coefs3zs = coefs + 3 * zs; |
233: |
234: const T pre20 = d2a[i] * b[j]; |
235: const T pre10 = da[i] * b[j]; |
236: const T pre00 = a[i] * b[j]; |
237: const T pre11 = da[i] * db[j]; |
238: const T pre01 = a[i] * db[j]; |
239: const T pre02 = a[i] * d2b[j]; |
240: |
241: const int iSplitPoint = num_splines; |
242: for (int n = 0; n < iSplitPoint; n++) |
[...] |
249: T sum0 = c[0] * coefsv + c[1] * coefsvzs + c[2] * coefsv2zs + c[3] * coefsv3zs; |
250: T sum1 = dc[0] * coefsv + dc[1] * coefsvzs + dc[2] * coefsv2zs + dc[3] * coefsv3zs; |
251: T sum2 = d2c[0] * coefsv + d2c[1] * coefsvzs + d2c[2] * coefsv2zs + d2c[3] * coefsv3zs; |
252: |
253: hxx[n] += pre20 * sum0; |
254: hxy[n] += pre11 * sum0; |
255: hxz[n] += pre10 * sum1; |
256: hyy[n] += pre02 * sum0; |
257: hyz[n] += pre01 * sum1; |
258: hzz[n] += pre00 * sum2; |
259: gx[n] += pre10 * sum0; |
260: gy[n] += pre01 * sum0; |
261: gz[n] += pre00 * sum1; |
262: vals[n] += pre00 * sum0; |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.75 |
CQA speedup if FP arith vectorized | 2.25 |
CQA speedup if fully vectorized | 10.85 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.12 |
Bottlenecks | |
Function | _ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m |
Source | MultiBsplineRef.hpp:227-229,MultiBsplineRef.hpp:234-239,MultiBsplineRef.hpp:242-242 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 5.25 |
CQA cycles if no scalar integer | 3.00 |
CQA cycles if FP arith vectorized | 2.33 |
CQA cycles if fully vectorized | 0.48 |
Front-end cycles | 5.25 |
DIV/SQRT cycles | 3.00 |
P0 cycles | 3.00 |
P1 cycles | 4.67 |
P2 cycles | 4.67 |
P3 cycles | 1.50 |
P4 cycles | 2.80 |
P5 cycles | 2.60 |
P6 cycles | 1.50 |
P7 cycles | 1.50 |
P8 cycles | 1.50 |
P9 cycles | 2.60 |
P10 cycles | 4.67 |
P11 cycles | 0.00 |
Inter-iter dependencies cycles | 1 |
FE+BE cycles (UFS) | 5.77 |
Stall cycles (UFS) | 0.00 |
Nb insns | 29.50 |
Nb uops | 30.50 |
Nb loads | 14.00 |
Nb stores | 3.00 |
Nb stack references | 10.00 |
FLOP/cycle | 1.14 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 6.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 25.15 |
Bytes prefetched | 0.00 |
Bytes loaded | 108.00 |
Bytes stored | 24.00 |
Stride 0 | 1.00 |
Stride 1 | 0.00 |
Stride n | 0.00 |
Stride unknown | 7.50 |
Stride indirect | 0.00 |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 12.08 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 6.25 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.78 |
CQA speedup if FP arith vectorized | 2.29 |
CQA speedup if fully vectorized | 10.91 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.14 |
Bottlenecks | micro-operation queue, |
Function | _ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m |
Source | MultiBsplineRef.hpp:227-229,MultiBsplineRef.hpp:234-239,MultiBsplineRef.hpp:242-242 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 5.33 |
CQA cycles if no scalar integer | 3.00 |
CQA cycles if FP arith vectorized | 2.33 |
CQA cycles if fully vectorized | 0.49 |
Front-end cycles | 5.33 |
DIV/SQRT cycles | 3.00 |
P0 cycles | 3.00 |
P1 cycles | 4.67 |
P2 cycles | 4.67 |
P3 cycles | 1.50 |
P4 cycles | 2.80 |
P5 cycles | 2.60 |
P6 cycles | 1.50 |
P7 cycles | 1.50 |
P8 cycles | 1.50 |
P9 cycles | 2.60 |
P10 cycles | 4.67 |
P11 cycles | 0.00 |
Inter-iter dependencies cycles | 1 |
FE+BE cycles (UFS) | 5.86 |
Stall cycles (UFS) | 0.00 |
Nb insns | 30.00 |
Nb uops | 31.00 |
Nb loads | 14.00 |
Nb stores | 3.00 |
Nb stack references | 10.00 |
FLOP/cycle | 1.13 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 6.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 24.75 |
Bytes prefetched | 0.00 |
Bytes loaded | 108.00 |
Bytes stored | 24.00 |
Stride 0 | 1.00 |
Stride 1 | 0.00 |
Stride n | 0.00 |
Stride unknown | 14.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 12.08 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 6.25 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.72 |
CQA speedup if FP arith vectorized | 2.21 |
CQA speedup if fully vectorized | 10.79 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.11 |
Bottlenecks | micro-operation queue, |
Function | _ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m |
Source | MultiBsplineRef.hpp:227-229,MultiBsplineRef.hpp:234-239,MultiBsplineRef.hpp:242-242 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 5.17 |
CQA cycles if no scalar integer | 3.00 |
CQA cycles if FP arith vectorized | 2.33 |
CQA cycles if fully vectorized | 0.48 |
Front-end cycles | 5.17 |
DIV/SQRT cycles | 3.00 |
P0 cycles | 3.00 |
P1 cycles | 4.67 |
P2 cycles | 4.67 |
P3 cycles | 1.50 |
P4 cycles | 2.80 |
P5 cycles | 2.60 |
P6 cycles | 1.50 |
P7 cycles | 1.50 |
P8 cycles | 1.50 |
P9 cycles | 2.60 |
P10 cycles | 4.67 |
P11 cycles | 0.00 |
Inter-iter dependencies cycles | 1 |
FE+BE cycles (UFS) | 5.68 |
Stall cycles (UFS) | 0.00 |
Nb insns | 29.00 |
Nb uops | 30.00 |
Nb loads | 14.00 |
Nb stores | 3.00 |
Nb stack references | 10.00 |
FLOP/cycle | 1.16 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 6.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 25.55 |
Bytes prefetched | 0.00 |
Bytes loaded | 108.00 |
Bytes stored | 24.00 |
Stride 0 | 1.00 |
Stride 1 | 0.00 |
Stride n | 0.00 |
Stride unknown | 1.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 12.08 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 6.25 |
Path / |
Function | _ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m |
Source file and lines | MultiBsplineRef.hpp:227-262 |
Module | libqmcwfs.so |
nb instructions | 29.50 |
nb uops | 30.50 |
loop length | 159 |
used x86 registers | 5 |
used mmx registers | 0 |
used xmm registers | 9 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 10 |
micro-operation queue | 5.25 cycles |
front end | 5.25 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 3.00 | 3.00 | 4.67 | 4.67 | 1.50 | 2.80 | 2.60 | 1.50 | 1.50 | 1.50 | 2.60 | 4.67 |
cycles | 3.00 | 3.00 | 4.67 | 4.67 | 1.50 | 2.80 | 2.60 | 1.50 | 1.50 | 1.50 | 2.60 | 4.67 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 1.00 |
FE+BE cycles | 5.77 |
Stall cycles | 0.00 |
Front-end | 5.25 |
Dispatch | 4.67 |
Data deps. | 1.00 |
Overall L1 | 5.25 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 0% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 11% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 6% |
all | 12% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 12% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 6% |
Function | _ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m |
Source file and lines | MultiBsplineRef.hpp:227-262 |
Module | libqmcwfs.so |
nb instructions | 30 |
nb uops | 31 |
loop length | 162 |
used x86 registers | 5 |
used mmx registers | 0 |
used xmm registers | 9 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 10 |
micro-operation queue | 5.33 cycles |
front end | 5.33 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 3.00 | 3.00 | 4.67 | 4.67 | 1.50 | 2.80 | 2.60 | 1.50 | 1.50 | 1.50 | 2.60 | 4.67 |
cycles | 3.00 | 3.00 | 4.67 | 4.67 | 1.50 | 2.80 | 2.60 | 1.50 | 1.50 | 1.50 | 2.60 | 4.67 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 1.00 |
FE+BE cycles | 5.86 |
Stall cycles | 0.00 |
Front-end | 5.33 |
Dispatch | 4.67 |
Data deps. | 1.00 |
Overall L1 | 5.33 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 0% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 11% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 6% |
all | 12% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 12% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 6% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV -0x178(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RDI,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
VMOVSD -0x180(%RBP),%XMM11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %RAX,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV -0x170(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RCX,-0x160(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV -0x198(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD (%RCX,%RAX,1),%XMM8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x1a8(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD (%RCX,%RAX,1),%XMM13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x1a0(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMULSD %XMM8,%XMM10,%XMM9 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM8,%XMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM6,%XMM8,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD (%RCX,%RAX,1),%XMM6,%XMM8 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
MOV -0x190(%RBP),%ECX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMULSD %XMM11,%XMM13,%XMM15 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM13,%XMM6,%XMM13 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
TEST %ECX,%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
JLE 4c0d8 <_ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m+0x618> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV -0x188(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADDQ $0x8,-0x170(%RBP) | 2 | 0.20 | 0.20 | 0.33 | 0.33 | 0.50 | 0.20 | 0.20 | 0.50 | 0.50 | 0.50 | 0.20 | 0.33 | 1 | 0.50 |
MOV -0x170(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %RAX,-0x158(%RBP) | 2 | 0.20 | 0.20 | 0.33 | 0.33 | 0.50 | 0.20 | 0.20 | 0.50 | 0.50 | 0.50 | 0.20 | 0.33 | 1 | 0.50 |
ADD %RAX,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %RAX,%RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP $0x20,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JNE 4bf4b <_ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m+0x48b> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
Function | _ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m |
Source file and lines | MultiBsplineRef.hpp:227-262 |
Module | libqmcwfs.so |
nb instructions | 29 |
nb uops | 30 |
loop length | 156 |
used x86 registers | 5 |
used mmx registers | 0 |
used xmm registers | 9 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 10 |
micro-operation queue | 5.17 cycles |
front end | 5.17 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 3.00 | 3.00 | 4.67 | 4.67 | 1.50 | 2.80 | 2.60 | 1.50 | 1.50 | 1.50 | 2.60 | 4.67 |
cycles | 3.00 | 3.00 | 4.67 | 4.67 | 1.50 | 2.80 | 2.60 | 1.50 | 1.50 | 1.50 | 2.60 | 4.67 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 1.00 |
FE+BE cycles | 5.68 |
Stall cycles | 0.00 |
Front-end | 5.17 |
Dispatch | 4.67 |
Data deps. | 1.00 |
Overall L1 | 5.17 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 0% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 11% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 6% |
all | 12% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 12% |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 6% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV -0x178(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RDI,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
VMOVSD -0x180(%RBP),%XMM11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %RAX,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV -0x170(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV %RCX,-0x160(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV -0x198(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD (%RCX,%RAX,1),%XMM8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x1a8(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD (%RCX,%RAX,1),%XMM13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x1a0(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMULSD %XMM8,%XMM10,%XMM9 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM11,%XMM8,%XMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM6,%XMM8,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD (%RCX,%RAX,1),%XMM6,%XMM8 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
MOV -0x190(%RBP),%ECX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
XOR %EAX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMULSD %XMM11,%XMM13,%XMM15 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM13,%XMM6,%XMM13 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
TEST %ECX,%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
JLE 4c0d8 <_ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m+0x618> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV -0x188(%RBP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADDQ $0x8,-0x170(%RBP) | 2 | 0.20 | 0.20 | 0.33 | 0.33 | 0.50 | 0.20 | 0.20 | 0.50 | 0.50 | 0.50 | 0.20 | 0.33 | 1 | 0.50 |
MOV -0x170(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %RAX,-0x158(%RBP) | 2 | 0.20 | 0.20 | 0.33 | 0.33 | 0.50 | 0.20 | 0.20 | 0.50 | 0.50 | 0.50 | 0.20 | 0.33 | 1 | 0.50 |
ADD %RAX,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %RAX,%RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP $0x20,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JNE 4bf4b <_ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m+0x48b> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |