Loop Id: 315 | Module: exec | Source: BsplineFunctor.h:303-338 [...] | Coverage: 0.06% |
---|
Loop Id: 315 | Module: exec | Source: BsplineFunctor.h:303-338 [...] | Coverage: 0.06% |
---|
0x41ca50 VMOVUPD (%R8,%RBX,8),%YMM10 [18] |
0x41ca56 VMULPD %YMM15,%YMM10,%YMM2 |
0x41ca5b VCVTTPD2DQ %YMM2,%XMM1 |
0x41ca5f VPMOVSXDQ %XMM1,%YMM1 |
0x41ca64 VPSLLQ $0x3,%YMM1,%YMM1 |
0x41ca69 VMOVQ %RSI,%XMM4 |
0x41ca6e VPBROADCASTQ %XMM4,%YMM4 |
0x41ca73 VROUNDPD $0xb,%YMM2,%YMM5 |
0x41ca79 VPADDQ %YMM1,%YMM4,%YMM1 |
0x41ca7d VMOVQ %XMM1,%R14 |
0x41ca82 VEXTRACTI128 $0x1,%YMM1,%XMM8 |
0x41ca88 VSUBPD %YMM5,%YMM2,%YMM4 |
0x41ca8c VMOVQ %XMM8,%R15 |
0x41ca91 VMOVSD (%R15),%XMM2 [7] |
0x41ca96 VPEXTRQ $0x1,%XMM1,%R15 |
0x41ca9c VMOVSD (%R14),%XMM5 [5] |
0x41caa1 VPADDQ %YMM6,%YMM1,%YMM7 |
0x41caa5 VPEXTRQ $0x1,%XMM8,%R14 |
0x41caab VMOVQ %XMM7,%R12 |
0x41cab0 VEXTRACTI128 $0x1,%YMM7,%XMM8 |
0x41cab6 VPEXTRQ $0x1,%XMM7,%R13 |
0x41cabc VPEXTRQ $0x1,%XMM8,%R10 |
0x41cac2 VMOVHPD (%R15),%XMM5,%XMM7 [25] |
0x41cac7 VMOVQ %XMM8,%R15 |
0x41cacc VMOVSD (%R15),%XMM5 [6] |
0x41cad1 VMOVHPD (%R14),%XMM2,%XMM2 [21] |
0x41cad6 VMOVHPD (%R10),%XMM5,%XMM5 [13] |
0x41cadb VMOVSD (%R12),%XMM8 [17] |
0x41cae1 VINSERTF128 $0x1,%XMM2,%YMM7,%YMM2 |
0x41cae7 VMOVHPD (%R13),%XMM8,%XMM7 [20] |
0x41caed VMOVAPD %YMM4,%YMM8 |
0x41caf1 VMOVAPD %YMM4,%YMM9 |
0x41caf5 VMOVUPD 0x540(%RSP),%YMM0 [32] |
0x41cafe VFMADD132PD 0x560(%RSP),%YMM0,%YMM9 [32] |
0x41cb08 VINSERTF128 $0x1,%XMM5,%YMM7,%YMM7 |
0x41cb0e VFMADD213PD 0x520(%RSP),%YMM4,%YMM9 [32] |
0x41cb18 VMOVAPD %YMM4,%YMM5 |
0x41cb1c VMOVUPD 0x620(%RSP),%YMM0 [32] |
0x41cb25 VFMADD213PD 0x40(%RSP),%YMM0,%YMM5 [32] |
0x41cb2c VMOVUPD 0x640(%RSP),%YMM0 [32] |
0x41cb35 VFMADD132PD 0x660(%RSP),%YMM0,%YMM8 [32] |
0x41cb3f VMULPD %YMM2,%YMM5,%YMM5 |
0x41cb43 VFMADD213PD %YMM5,%YMM7,%YMM8 |
0x41cb48 VMOVAPD %YMM4,%YMM5 |
0x41cb4c VMOVUPD 0x4e0(%RSP),%YMM0 [32] |
0x41cb55 VFMADD132PD 0x500(%RSP),%YMM0,%YMM5 [32] |
0x41cb5f VFMADD213PD 0x4c0(%RSP),%YMM4,%YMM5 [32] |
0x41cb69 VMULPD %YMM2,%YMM9,%YMM9 |
0x41cb6d VFMADD213PD %YMM9,%YMM7,%YMM5 |
0x41cb72 VMOVAPD %YMM4,%YMM9 |
0x41cb76 VMOVUPD 0x3c0(%RSP),%YMM0 [32] |
0x41cb7f VFMADD132PD 0x3e0(%RSP),%YMM0,%YMM9 [32] |
0x41cb89 VFMADD213PD 0x3a0(%RSP),%YMM4,%YMM9 [32] |
0x41cb93 VFMADD213PD 0x380(%RSP),%YMM4,%YMM9 [32] |
0x41cb9d VMULPD %YMM2,%YMM9,%YMM9 |
0x41cba1 VMOVAPD %YMM4,%YMM2 |
0x41cba5 VMOVUPD 0x340(%RSP),%YMM0 [32] |
0x41cbae VFMADD132PD 0x360(%RSP),%YMM0,%YMM2 [32] |
0x41cbb8 VFMADD213PD 0x320(%RSP),%YMM4,%YMM2 [32] |
0x41cbc2 VFMADD213PD 0x300(%RSP),%YMM4,%YMM2 [32] |
0x41cbcc VFMADD213PD %YMM9,%YMM7,%YMM2 |
0x41cbd1 VPADDQ %YMM3,%YMM1,%YMM7 |
0x41cbd5 VPEXTRQ $0x1,%XMM7,%R10 |
0x41cbdb VMOVQ %XMM7,%R14 |
0x41cbe0 VEXTRACTI128 $0x1,%YMM7,%XMM7 |
0x41cbe6 VPEXTRQ $0x1,%XMM7,%R15 |
0x41cbec VMOVQ %XMM7,%R12 |
0x41cbf1 VMOVSD (%R12),%XMM7 [16] |
0x41cbf7 VPADDQ %YMM1,%YMM13,%YMM1 |
0x41cbfb VMOVQ %XMM1,%R12 |
0x41cc00 VPEXTRQ $0x1,%XMM1,%R13 |
0x41cc06 VMOVHPD (%R15),%XMM7,%XMM7 [24] |
0x41cc0b VEXTRACTI128 $0x1,%YMM1,%XMM1 |
0x41cc11 VMOVQ %XMM1,%R15 |
0x41cc16 VPEXTRQ $0x1,%XMM1,%R8 |
0x41cc1c VMOVSD (%R14),%XMM1 [8] |
0x41cc21 VMOVHPD (%R10),%XMM1,%XMM1 [11] |
0x41cc26 VINSERTF128 $0x1,%XMM7,%YMM1,%YMM9 |
0x41cc2c VMOVAPD %YMM4,%YMM7 |
0x41cc30 VMOVUPD 0x5e0(%RSP),%YMM0 [32] |
0x41cc39 VFMADD132PD 0x600(%RSP),%YMM0,%YMM7 [32] |
0x41cc43 VFMADD213PD %YMM8,%YMM9,%YMM7 |
0x41cc48 VMOVAPD %YMM4,%YMM8 |
0x41cc4c VMOVUPD 0x480(%RSP),%YMM0 [32] |
0x41cc55 VFMADD132PD 0x4a0(%RSP),%YMM0,%YMM8 [32] |
0x41cc5f VFMADD213PD 0x460(%RSP),%YMM4,%YMM8 [32] |
0x41cc69 VFMADD213PD %YMM5,%YMM9,%YMM8 |
0x41cc6e VMOVAPD %YMM4,%YMM1 |
0x41cc72 VMOVUPD 0x2c0(%RSP),%YMM0 [32] |
0x41cc7b VFMADD132PD 0x2e0(%RSP),%YMM0,%YMM1 [32] |
0x41cc85 VFMADD213PD 0x2a0(%RSP),%YMM4,%YMM1 [32] |
0x41cc8f VFMADD213PD 0x280(%RSP),%YMM4,%YMM1 [32] |
0x41cc99 VFMADD213PD %YMM2,%YMM9,%YMM1 |
0x41cc9e VMOVSD (%R15),%XMM2 [23] |
0x41cca3 VMOVHPD (%R8),%XMM2,%XMM2 [1] |
0x41cca8 VMOVSD (%R12),%XMM5 [30] |
0x41ccae VMOVHPD (%R13),%XMM5,%XMM5 [19] |
0x41ccb4 VINSERTF128 $0x1,%XMM2,%YMM5,%YMM5 |
0x41ccba VMOVAPD %YMM4,%YMM2 |
0x41ccbe VMOVUPD 0x420(%RSP),%YMM0 [32] |
0x41ccc7 VFMADD132PD 0x440(%RSP),%YMM0,%YMM2 [32] |
0x41ccd1 VFMADD213PD 0x400(%RSP),%YMM4,%YMM2 [32] |
0x41ccdb VFMADD213PD %YMM8,%YMM5,%YMM2 |
0x41cce0 VMULPD %YMM2,%YMM15,%YMM2 |
0x41cce4 VDIVPD %YMM10,%YMM2,%YMM8 |
0x41cce9 VMOVAPD %YMM4,%YMM2 |
0x41cced VFMADD132PD 0x260(%RSP),%YMM12,%YMM2 [32] |
0x41ccf7 VFMADD213PD %YMM14,%YMM4,%YMM2 |
0x41ccfc VFMADD213PD %YMM11,%YMM4,%YMM2 |
0x41cd01 VMOVUPD 0x5a0(%RSP),%YMM0 [32] |
0x41cd0a VFMADD132PD 0x5c0(%RSP),%YMM0,%YMM4 [32] |
0x41cd14 VFMADD213PD %YMM7,%YMM5,%YMM4 |
0x41cd19 MOVSXD (%RAX,%RBX,4),%R8 [27] |
0x41cd1d MOVSXD 0x4(%RAX,%RBX,4),%R10 [27] |
0x41cd22 MOVSXD 0x8(%RAX,%RBX,4),%R14 [27] |
0x41cd27 MOVSXD 0xc(%RAX,%RBX,4),%R15 [27] |
0x41cd2c VMULPD 0x580(%RSP),%YMM4,%YMM4 [32] |
0x41cd35 ADD %R11,%R8 |
0x41cd38 VMOVLPD %XMM4,(%RCX,%R8,8) [28] |
0x41cd3e ADD %R11,%R10 |
0x41cd41 VMOVHPD %XMM4,(%RCX,%R10,8) [4] |
0x41cd47 VEXTRACTF128 $0x1,%YMM4,%XMM4 |
0x41cd4d ADD %R11,%R14 |
0x41cd50 ADD %R11,%R15 |
0x41cd53 VMOVLPD %XMM4,(%RCX,%R14,8) [14] |
0x41cd59 VMOVHPD %XMM4,(%RCX,%R15,8) [10] |
0x41cd5f VMOVLPD %XMM8,(%RDX,%R8,8) [29] |
0x41cd65 VFMADD213PD %YMM1,%YMM5,%YMM2 |
0x41cd6a VMOVHPD %XMM8,(%RDX,%R10,8) [3] |
0x41cd70 VEXTRACTF128 $0x1,%YMM8,%XMM1 |
0x41cd76 VMOVLPD %XMM1,(%RDX,%R14,8) [15] |
0x41cd7c VMOVHPD %XMM1,(%RDX,%R15,8) [12] |
0x41cd82 VMOVLPD %XMM2,(%R9,%R8,8) [2] |
0x41cd88 MOV 0x20(%RBP),%R8 [31] |
0x41cd8c VMOVHPD %XMM2,(%R9,%R10,8) [9] |
0x41cd92 VEXTRACTF128 $0x1,%YMM2,%XMM1 |
0x41cd98 VMOVLPD %XMM1,(%R9,%R14,8) [26] |
0x41cd9e VMOVHPD %XMM1,(%R9,%R15,8) [22] |
0x41cda4 ADD $0x4,%RBX |
0x41cda8 CMP %RDI,%RBX |
0x41cdab JB 41ca50 |
/scratch_na/users/xoserete/qaas_runs/171-417-8059/intel/miniqmc/build/miniqmc/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h: 303 - 338 |
-------------------------------------------------------------------------------- |
303: for (int j = 0; j < iCount; j++) |
304: { |
305: real_type r = distArrayCompressed[j]; |
306: int iScatter = distIndices[j]; |
307: real_type rinv = cOne / r; |
308: r *= DeltaRInv; |
309: int iGather = (int)r; |
310: real_type t = r - real_type(iGather); |
311: real_type tp0 = t * t * t; |
312: real_type tp1 = t * t; |
313: real_type tp2 = t; |
314: |
315: real_type sCoef0 = SplineCoefs[iGather + 0]; |
316: real_type sCoef1 = SplineCoefs[iGather + 1]; |
317: real_type sCoef2 = SplineCoefs[iGather + 2]; |
318: real_type sCoef3 = SplineCoefs[iGather + 3]; |
319: |
320: // clang-format off |
321: laplArray[iScatter] = dSquareDeltaRinv * |
322: (sCoef0*( d2A[ 2]*tp2 + d2A[ 3])+ |
323: sCoef1*( d2A[ 6]*tp2 + d2A[ 7])+ |
324: sCoef2*( d2A[10]*tp2 + d2A[11])+ |
325: sCoef3*( d2A[14]*tp2 + d2A[15])); |
326: |
327: gradArray[iScatter] = DeltaRInv * rinv * |
328: (sCoef0*( dA[ 1]*tp1 + dA[ 2]*tp2 + dA[ 3])+ |
329: sCoef1*( dA[ 5]*tp1 + dA[ 6]*tp2 + dA[ 7])+ |
330: sCoef2*( dA[ 9]*tp1 + dA[10]*tp2 + dA[11])+ |
331: sCoef3*( dA[13]*tp1 + dA[14]*tp2 + dA[15])); |
332: |
333: valArray[iScatter] = (sCoef0*(A[ 0]*tp0 + A[ 1]*tp1 + A[ 2]*tp2 + A[ 3])+ |
334: sCoef1*(A[ 4]*tp0 + A[ 5]*tp1 + A[ 6]*tp2 + A[ 7])+ |
335: sCoef2*(A[ 8]*tp0 + A[ 9]*tp1 + A[10]*tp2 + A[11])+ |
336: sCoef3*(A[12]*tp0 + A[13]*tp1 + A[14]*tp2 + A[15])); |
337: // clang-format on |
338: } |
/usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8/bits/stl_vector.h: 951 - 951 |
-------------------------------------------------------------------------------- |
951: return *(this->_M_impl._M_start + __n); |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►61.47+ | miniqmcreference::TwoBodyJastr[...] | TwoBodyJastrowRef.h:274 | exec |
○ | qmcplusplus::WaveFunction::rat[...] | WaveFunction.cpp:201 | exec |
○ | main.extracted.110 | refwrap.h:313 | exec |
○ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_invoke_task_func | libiomp5.so | |
►34.81+ | miniqmcreference::TwoBodyJastr[...] | TwoBodyJastrowRef.h:274 | exec |
○ | qmcplusplus::WaveFunction::acc[...] | NewTimer.h:249 | exec |
○ | main.extracted.110 | refwrap.h:313 | exec |
○ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_invoke_task_func | libiomp5.so | |
►3.59+ | miniqmcreference::TwoBodyJastr[...] | TwoBodyJastrowRef.h:274 | exec |
○ | miniqmcreference::TwoBodyJastr[...] | TwoBodyJastrowRef.h:411 | exec |
○ | qmcplusplus::WaveFunction::eva[...] | WaveFunction.cpp:175 | exec |
○ | main.extracted.113 | miniqmc.cpp:397 | exec |
○ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_invoke_task_func | libiomp5.so |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.44 |
CQA speedup if fully vectorized | 2.72 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.01 |
Bottlenecks | P0, |
Function | qmcplusplus::BsplineFunctor |
Source | BsplineFunctor.h:303-338,stl_vector.h:951-951 |
Source loop unroll info | unrolled by 4 |
Source loop unroll confidence level | high |
Unroll/vectorization loop type | main |
Unroll factor | 4 |
CQA cycles | 34.00 |
CQA cycles if no scalar integer | 34.00 |
CQA cycles if FP arith vectorized | 23.58 |
CQA cycles if fully vectorized | 12.50 |
Front-end cycles | 25.00 |
DIV/SQRT cycles | 8.00 |
P0 cycles | 34.00 |
P1 cycles | 33.50 |
P2 cycles | 18.67 |
P3 cycles | 18.67 |
P4 cycles | 6.00 |
P5 cycles | 28.50 |
P6 cycles | 2.60 |
P7 cycles | 6.00 |
P8 cycles | 6.00 |
P9 cycles | 6.00 |
P10 cycles | 2.40 |
P11 cycles | 18.67 |
Inter-iter dependencies cycles | 1 |
FE+BE cycles (UFS) | 44.04 - 45.41 |
Stall cycles (UFS) | 18.61 - 19.95 |
Nb insns | 141.00 |
Nb uops | 150.00 |
Nb loads | 56.00 |
Nb stores | 12.00 |
Nb stack references | 35.00 |
FLOP/cycle | 8.71 |
Nb FLOP add-sub | 4.00 |
Nb FLOP mul | 24.00 |
Nb FLOP fma | 132.00 |
Nb FLOP div | 4.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 40.24 |
Bytes prefetched | 0.00 |
Bytes loaded | 1272.00 |
Bytes stored | 96.00 |
Stride 0 | 2.00 |
Stride 1 | 2.00 |
Stride n | 0.00 |
Stride unknown | 16.00 |
Stride indirect | 2.00 |
Vectorization ratio all | 64.34 |
Vectorization ratio load | 68.63 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 100.00 |
Vectorization ratio add_sub | 100.00 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | 100.00 |
Vectorization ratio other | 59.09 |
Vector-efficiency ratio all | 34.30 |
Vector-efficiency ratio load | 38.24 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 50.00 |
Vector-efficiency ratio add_sub | 50.00 |
Vector-efficiency ratio fma | 50.00 |
Vector-efficiency ratio div_sqrt | 50.00 |
Vector-efficiency ratio other | 27.84 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.44 |
CQA speedup if fully vectorized | 2.72 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.01 |
Bottlenecks | P0, |
Function | qmcplusplus::BsplineFunctor |
Source | BsplineFunctor.h:303-338,stl_vector.h:951-951 |
Source loop unroll info | unrolled by 4 |
Source loop unroll confidence level | high |
Unroll/vectorization loop type | main |
Unroll factor | 4 |
CQA cycles | 34.00 |
CQA cycles if no scalar integer | 34.00 |
CQA cycles if FP arith vectorized | 23.58 |
CQA cycles if fully vectorized | 12.50 |
Front-end cycles | 25.00 |
DIV/SQRT cycles | 8.00 |
P0 cycles | 34.00 |
P1 cycles | 33.50 |
P2 cycles | 18.67 |
P3 cycles | 18.67 |
P4 cycles | 6.00 |
P5 cycles | 28.50 |
P6 cycles | 2.60 |
P7 cycles | 6.00 |
P8 cycles | 6.00 |
P9 cycles | 6.00 |
P10 cycles | 2.40 |
P11 cycles | 18.67 |
Inter-iter dependencies cycles | 1 |
FE+BE cycles (UFS) | 44.04 - 45.41 |
Stall cycles (UFS) | 18.61 - 19.95 |
Nb insns | 141.00 |
Nb uops | 150.00 |
Nb loads | 56.00 |
Nb stores | 12.00 |
Nb stack references | 35.00 |
FLOP/cycle | 8.71 |
Nb FLOP add-sub | 4.00 |
Nb FLOP mul | 24.00 |
Nb FLOP fma | 132.00 |
Nb FLOP div | 4.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 40.24 |
Bytes prefetched | 0.00 |
Bytes loaded | 1272.00 |
Bytes stored | 96.00 |
Stride 0 | 2.00 |
Stride 1 | 2.00 |
Stride n | 0.00 |
Stride unknown | 16.00 |
Stride indirect | 2.00 |
Vectorization ratio all | 64.34 |
Vectorization ratio load | 68.63 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 100.00 |
Vectorization ratio add_sub | 100.00 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | 100.00 |
Vectorization ratio other | 59.09 |
Vector-efficiency ratio all | 34.30 |
Vector-efficiency ratio load | 38.24 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 50.00 |
Vector-efficiency ratio add_sub | 50.00 |
Vector-efficiency ratio fma | 50.00 |
Vector-efficiency ratio div_sqrt | 50.00 |
Vector-efficiency ratio other | 27.84 |
Path / |
Function | qmcplusplus::BsplineFunctor |
Source file and lines | BsplineFunctor.h:303-338 |
Module | exec |
nb instructions | 141 |
nb uops | 150 |
loop length | 865 |
used x86 registers | 16 |
used mmx registers | 0 |
used xmm registers | 6 |
used ymm registers | 16 |
used zmm registers | 0 |
nb stack references | 35 |
ADD-SUB / MUL ratio | 0.17 |
micro-operation queue | 25.00 cycles |
front end | 25.00 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 34.00 | 33.50 | 18.67 | 18.67 | 6.00 | 28.50 | 2.60 | 6.00 | 6.00 | 6.00 | 2.40 | 18.67 |
cycles | 34.00 | 33.50 | 18.67 | 18.67 | 6.00 | 28.50 | 2.60 | 6.00 | 6.00 | 6.00 | 2.40 | 18.67 |
Cycles executing div or sqrt instructions | 8.00 |
Longest recurrence chain latency (RecMII) | 1.00 |
FE+BE cycles | 44.04-45.41 |
Stall cycles | 18.61-19.95 |
LM full (events) | 25.60-27.32 |
Front-end | 25.00 |
Dispatch | 34.00 |
DIV/SQRT | 8.00 |
Data deps. | 1.00 |
Overall L1 | 34.00 |
all | 35% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 25% |
all | 72% |
load | 68% |
store | 0% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 100% |
all | 64% |
load | 68% |
store | 0% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 59% |
all | 21% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 50% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 16% |
all | 37% |
load | 38% |
store | 12% |
mul | 50% |
add-sub | 50% |
fma | 50% |
div/sqrt | 50% |
other | 41% |
all | 34% |
load | 38% |
store | 12% |
mul | 50% |
add-sub | 50% |
fma | 50% |
div/sqrt | 50% |
other | 27% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VMOVUPD (%R8,%RBX,8),%YMM10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VMULPD %YMM15,%YMM10,%YMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCVTTPD2DQ %YMM2,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 1 |
VPMOVSXDQ %XMM1,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSLLQ $0x3,%YMM1,%YMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2-4 | 0.50 |
VMOVQ %RSI,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VPBROADCASTQ %XMM4,%YMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VROUNDPD $0xb,%YMM2,%YMM5 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VPADDQ %YMM1,%YMM4,%YMM1 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VMOVQ %XMM1,%R14 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VEXTRACTI128 $0x1,%YMM1,%XMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VSUBPD %YMM5,%YMM2,%YMM4 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVQ %XMM8,%R15 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMOVSD (%R15),%XMM2 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VPEXTRQ $0x1,%XMM1,%R15 | 2 | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 |
VMOVSD (%R14),%XMM5 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VPADDQ %YMM6,%YMM1,%YMM7 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VPEXTRQ $0x1,%XMM8,%R14 | 2 | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 |
VMOVQ %XMM7,%R12 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VEXTRACTI128 $0x1,%YMM7,%XMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPEXTRQ $0x1,%XMM7,%R13 | 2 | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 |
VPEXTRQ $0x1,%XMM8,%R10 | 2 | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 |
VMOVHPD (%R15),%XMM5,%XMM7 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VMOVQ %XMM8,%R15 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMOVSD (%R15),%XMM5 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVHPD (%R14),%XMM2,%XMM2 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VMOVHPD (%R10),%XMM5,%XMM5 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VMOVSD (%R12),%XMM8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VINSERTF128 $0x1,%XMM2,%YMM7,%YMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVHPD (%R13),%XMM8,%XMM7 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VMOVAPD %YMM4,%YMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVAPD %YMM4,%YMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVUPD 0x540(%RSP),%YMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VFMADD132PD 0x560(%RSP),%YMM0,%YMM9 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VINSERTF128 $0x1,%XMM5,%YMM7,%YMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VFMADD213PD 0x520(%RSP),%YMM4,%YMM9 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VMOVAPD %YMM4,%YMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVUPD 0x620(%RSP),%YMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VFMADD213PD 0x40(%RSP),%YMM0,%YMM5 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VMOVUPD 0x640(%RSP),%YMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VFMADD132PD 0x660(%RSP),%YMM0,%YMM8 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VMULPD %YMM2,%YMM5,%YMM5 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %YMM5,%YMM7,%YMM8 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVAPD %YMM4,%YMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVUPD 0x4e0(%RSP),%YMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VFMADD132PD 0x500(%RSP),%YMM0,%YMM5 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD 0x4c0(%RSP),%YMM4,%YMM5 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VMULPD %YMM2,%YMM9,%YMM9 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %YMM9,%YMM7,%YMM5 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVAPD %YMM4,%YMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVUPD 0x3c0(%RSP),%YMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VFMADD132PD 0x3e0(%RSP),%YMM0,%YMM9 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD 0x3a0(%RSP),%YMM4,%YMM9 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD 0x380(%RSP),%YMM4,%YMM9 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VMULPD %YMM2,%YMM9,%YMM9 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVAPD %YMM4,%YMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVUPD 0x340(%RSP),%YMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VFMADD132PD 0x360(%RSP),%YMM0,%YMM2 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD 0x320(%RSP),%YMM4,%YMM2 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD 0x300(%RSP),%YMM4,%YMM2 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD %YMM9,%YMM7,%YMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPADDQ %YMM3,%YMM1,%YMM7 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VPEXTRQ $0x1,%XMM7,%R10 | 2 | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 |
VMOVQ %XMM7,%R14 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VEXTRACTI128 $0x1,%YMM7,%XMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPEXTRQ $0x1,%XMM7,%R15 | 2 | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 |
VMOVQ %XMM7,%R12 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMOVSD (%R12),%XMM7 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VPADDQ %YMM1,%YMM13,%YMM1 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VMOVQ %XMM1,%R12 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VPEXTRQ $0x1,%XMM1,%R13 | 2 | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 |
VMOVHPD (%R15),%XMM7,%XMM7 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VEXTRACTI128 $0x1,%YMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVQ %XMM1,%R15 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VPEXTRQ $0x1,%XMM1,%R8 | 2 | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 |
VMOVSD (%R14),%XMM1 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVHPD (%R10),%XMM1,%XMM1 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VINSERTF128 $0x1,%XMM7,%YMM1,%YMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVAPD %YMM4,%YMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVUPD 0x5e0(%RSP),%YMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VFMADD132PD 0x600(%RSP),%YMM0,%YMM7 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD %YMM8,%YMM9,%YMM7 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVAPD %YMM4,%YMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVUPD 0x480(%RSP),%YMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VFMADD132PD 0x4a0(%RSP),%YMM0,%YMM8 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD 0x460(%RSP),%YMM4,%YMM8 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD %YMM5,%YMM9,%YMM8 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVAPD %YMM4,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVUPD 0x2c0(%RSP),%YMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VFMADD132PD 0x2e0(%RSP),%YMM0,%YMM1 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD 0x2a0(%RSP),%YMM4,%YMM1 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD 0x280(%RSP),%YMM4,%YMM1 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD %YMM2,%YMM9,%YMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD (%R15),%XMM2 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVHPD (%R8),%XMM2,%XMM2 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VMOVSD (%R12),%XMM5 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVHPD (%R13),%XMM5,%XMM5 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VINSERTF128 $0x1,%XMM2,%YMM5,%YMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVAPD %YMM4,%YMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVUPD 0x420(%RSP),%YMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VFMADD132PD 0x440(%RSP),%YMM0,%YMM2 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD 0x400(%RSP),%YMM4,%YMM2 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD %YMM8,%YMM5,%YMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPD %YMM2,%YMM15,%YMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VDIVPD %YMM10,%YMM2,%YMM8 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13-15 | 8 |
VMOVAPD %YMM4,%YMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VFMADD132PD 0x260(%RSP),%YMM12,%YMM2 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD %YMM14,%YMM4,%YMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %YMM11,%YMM4,%YMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVUPD 0x5a0(%RSP),%YMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VFMADD132PD 0x5c0(%RSP),%YMM0,%YMM4 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD %YMM7,%YMM5,%YMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOVSXD (%RAX,%RBX,4),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVSXD 0x4(%RAX,%RBX,4),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVSXD 0x8(%RAX,%RBX,4),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVSXD 0xc(%RAX,%RBX,4),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMULPD 0x580(%RSP),%YMM4,%YMM4 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
ADD %R11,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VMOVLPD %XMM4,(%RCX,%R8,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 4-12 | 0.50 |
ADD %R11,%R10 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VMOVHPD %XMM4,(%RCX,%R10,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 4-12 | 0.50 |
VEXTRACTF128 $0x1,%YMM4,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %R11,%R14 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %R11,%R15 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VMOVLPD %XMM4,(%RCX,%R14,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 4-12 | 0.50 |
VMOVHPD %XMM4,(%RCX,%R15,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 4-12 | 0.50 |
VMOVLPD %XMM8,(%RDX,%R8,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 4-12 | 0.50 |
VFMADD213PD %YMM1,%YMM5,%YMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVHPD %XMM8,(%RDX,%R10,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 4-12 | 0.50 |
VEXTRACTF128 $0x1,%YMM8,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVLPD %XMM1,(%RDX,%R14,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 4-12 | 0.50 |
VMOVHPD %XMM1,(%RDX,%R15,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 4-12 | 0.50 |
VMOVLPD %XMM2,(%R9,%R8,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 4-12 | 0.50 |
MOV 0x20(%RBP),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVHPD %XMM2,(%R9,%R10,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 4-12 | 0.50 |
VEXTRACTF128 $0x1,%YMM2,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVLPD %XMM1,(%R9,%R14,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 4-12 | 0.50 |
VMOVHPD %XMM1,(%R9,%R15,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 4-12 | 0.50 |
ADD $0x4,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CMP %RDI,%RBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JB 41ca50 <_ZNK11qmcplusplus14BsplineFunctorIdE11evaluateVGLEiiiPKdPdS4_S4_S4_Pi+0x4c0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
Function | qmcplusplus::BsplineFunctor |
Source file and lines | BsplineFunctor.h:303-338 |
Module | exec |
nb instructions | 141 |
nb uops | 150 |
loop length | 865 |
used x86 registers | 16 |
used mmx registers | 0 |
used xmm registers | 6 |
used ymm registers | 16 |
used zmm registers | 0 |
nb stack references | 35 |
ADD-SUB / MUL ratio | 0.17 |
micro-operation queue | 25.00 cycles |
front end | 25.00 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 34.00 | 33.50 | 18.67 | 18.67 | 6.00 | 28.50 | 2.60 | 6.00 | 6.00 | 6.00 | 2.40 | 18.67 |
cycles | 34.00 | 33.50 | 18.67 | 18.67 | 6.00 | 28.50 | 2.60 | 6.00 | 6.00 | 6.00 | 2.40 | 18.67 |
Cycles executing div or sqrt instructions | 8.00 |
Longest recurrence chain latency (RecMII) | 1.00 |
FE+BE cycles | 44.04-45.41 |
Stall cycles | 18.61-19.95 |
LM full (events) | 25.60-27.32 |
Front-end | 25.00 |
Dispatch | 34.00 |
DIV/SQRT | 8.00 |
Data deps. | 1.00 |
Overall L1 | 34.00 |
all | 35% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 25% |
all | 72% |
load | 68% |
store | 0% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 100% |
all | 64% |
load | 68% |
store | 0% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 59% |
all | 21% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 50% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 16% |
all | 37% |
load | 38% |
store | 12% |
mul | 50% |
add-sub | 50% |
fma | 50% |
div/sqrt | 50% |
other | 41% |
all | 34% |
load | 38% |
store | 12% |
mul | 50% |
add-sub | 50% |
fma | 50% |
div/sqrt | 50% |
other | 27% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VMOVUPD (%R8,%RBX,8),%YMM10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VMULPD %YMM15,%YMM10,%YMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCVTTPD2DQ %YMM2,%XMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 1 |
VPMOVSXDQ %XMM1,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSLLQ $0x3,%YMM1,%YMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2-4 | 0.50 |
VMOVQ %RSI,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VPBROADCASTQ %XMM4,%YMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VROUNDPD $0xb,%YMM2,%YMM5 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VPADDQ %YMM1,%YMM4,%YMM1 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VMOVQ %XMM1,%R14 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VEXTRACTI128 $0x1,%YMM1,%XMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VSUBPD %YMM5,%YMM2,%YMM4 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMOVQ %XMM8,%R15 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMOVSD (%R15),%XMM2 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VPEXTRQ $0x1,%XMM1,%R15 | 2 | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 |
VMOVSD (%R14),%XMM5 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VPADDQ %YMM6,%YMM1,%YMM7 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VPEXTRQ $0x1,%XMM8,%R14 | 2 | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 |
VMOVQ %XMM7,%R12 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VEXTRACTI128 $0x1,%YMM7,%XMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPEXTRQ $0x1,%XMM7,%R13 | 2 | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 |
VPEXTRQ $0x1,%XMM8,%R10 | 2 | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 |
VMOVHPD (%R15),%XMM5,%XMM7 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VMOVQ %XMM8,%R15 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMOVSD (%R15),%XMM5 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVHPD (%R14),%XMM2,%XMM2 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VMOVHPD (%R10),%XMM5,%XMM5 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VMOVSD (%R12),%XMM8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VINSERTF128 $0x1,%XMM2,%YMM7,%YMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVHPD (%R13),%XMM8,%XMM7 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VMOVAPD %YMM4,%YMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVAPD %YMM4,%YMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVUPD 0x540(%RSP),%YMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VFMADD132PD 0x560(%RSP),%YMM0,%YMM9 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VINSERTF128 $0x1,%XMM5,%YMM7,%YMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VFMADD213PD 0x520(%RSP),%YMM4,%YMM9 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VMOVAPD %YMM4,%YMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVUPD 0x620(%RSP),%YMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VFMADD213PD 0x40(%RSP),%YMM0,%YMM5 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VMOVUPD 0x640(%RSP),%YMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VFMADD132PD 0x660(%RSP),%YMM0,%YMM8 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VMULPD %YMM2,%YMM5,%YMM5 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %YMM5,%YMM7,%YMM8 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVAPD %YMM4,%YMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVUPD 0x4e0(%RSP),%YMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VFMADD132PD 0x500(%RSP),%YMM0,%YMM5 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD 0x4c0(%RSP),%YMM4,%YMM5 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VMULPD %YMM2,%YMM9,%YMM9 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %YMM9,%YMM7,%YMM5 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVAPD %YMM4,%YMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVUPD 0x3c0(%RSP),%YMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VFMADD132PD 0x3e0(%RSP),%YMM0,%YMM9 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD 0x3a0(%RSP),%YMM4,%YMM9 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD 0x380(%RSP),%YMM4,%YMM9 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VMULPD %YMM2,%YMM9,%YMM9 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVAPD %YMM4,%YMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVUPD 0x340(%RSP),%YMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VFMADD132PD 0x360(%RSP),%YMM0,%YMM2 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD 0x320(%RSP),%YMM4,%YMM2 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD 0x300(%RSP),%YMM4,%YMM2 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD %YMM9,%YMM7,%YMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPADDQ %YMM3,%YMM1,%YMM7 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VPEXTRQ $0x1,%XMM7,%R10 | 2 | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 |
VMOVQ %XMM7,%R14 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VEXTRACTI128 $0x1,%YMM7,%XMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPEXTRQ $0x1,%XMM7,%R15 | 2 | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 |
VMOVQ %XMM7,%R12 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMOVSD (%R12),%XMM7 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VPADDQ %YMM1,%YMM13,%YMM1 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
VMOVQ %XMM1,%R12 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VPEXTRQ $0x1,%XMM1,%R13 | 2 | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 |
VMOVHPD (%R15),%XMM7,%XMM7 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VEXTRACTI128 $0x1,%YMM1,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVQ %XMM1,%R15 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VPEXTRQ $0x1,%XMM1,%R8 | 2 | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 |
VMOVSD (%R14),%XMM1 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVHPD (%R10),%XMM1,%XMM1 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VINSERTF128 $0x1,%XMM7,%YMM1,%YMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVAPD %YMM4,%YMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVUPD 0x5e0(%RSP),%YMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VFMADD132PD 0x600(%RSP),%YMM0,%YMM7 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD %YMM8,%YMM9,%YMM7 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVAPD %YMM4,%YMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVUPD 0x480(%RSP),%YMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VFMADD132PD 0x4a0(%RSP),%YMM0,%YMM8 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD 0x460(%RSP),%YMM4,%YMM8 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD %YMM5,%YMM9,%YMM8 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVAPD %YMM4,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVUPD 0x2c0(%RSP),%YMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VFMADD132PD 0x2e0(%RSP),%YMM0,%YMM1 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD 0x2a0(%RSP),%YMM4,%YMM1 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD 0x280(%RSP),%YMM4,%YMM1 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD %YMM2,%YMM9,%YMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD (%R15),%XMM2 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVHPD (%R8),%XMM2,%XMM2 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VMOVSD (%R12),%XMM5 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVHPD (%R13),%XMM5,%XMM5 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VINSERTF128 $0x1,%XMM2,%YMM5,%YMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVAPD %YMM4,%YMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVUPD 0x420(%RSP),%YMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VFMADD132PD 0x440(%RSP),%YMM0,%YMM2 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD 0x400(%RSP),%YMM4,%YMM2 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD %YMM8,%YMM5,%YMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPD %YMM2,%YMM15,%YMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VDIVPD %YMM10,%YMM2,%YMM8 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13-15 | 8 |
VMOVAPD %YMM4,%YMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VFMADD132PD 0x260(%RSP),%YMM12,%YMM2 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD %YMM14,%YMM4,%YMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %YMM11,%YMM4,%YMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVUPD 0x5a0(%RSP),%YMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VFMADD132PD 0x5c0(%RSP),%YMM0,%YMM4 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VFMADD213PD %YMM7,%YMM5,%YMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOVSXD (%RAX,%RBX,4),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVSXD 0x4(%RAX,%RBX,4),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVSXD 0x8(%RAX,%RBX,4),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVSXD 0xc(%RAX,%RBX,4),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMULPD 0x580(%RSP),%YMM4,%YMM4 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
ADD %R11,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VMOVLPD %XMM4,(%RCX,%R8,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 4-12 | 0.50 |
ADD %R11,%R10 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VMOVHPD %XMM4,(%RCX,%R10,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 4-12 | 0.50 |
VEXTRACTF128 $0x1,%YMM4,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
ADD %R11,%R14 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %R11,%R15 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VMOVLPD %XMM4,(%RCX,%R14,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 4-12 | 0.50 |
VMOVHPD %XMM4,(%RCX,%R15,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 4-12 | 0.50 |
VMOVLPD %XMM8,(%RDX,%R8,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 4-12 | 0.50 |
VFMADD213PD %YMM1,%YMM5,%YMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVHPD %XMM8,(%RDX,%R10,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 4-12 | 0.50 |
VEXTRACTF128 $0x1,%YMM8,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVLPD %XMM1,(%RDX,%R14,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 4-12 | 0.50 |
VMOVHPD %XMM1,(%RDX,%R15,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 4-12 | 0.50 |
VMOVLPD %XMM2,(%R9,%R8,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 4-12 | 0.50 |
MOV 0x20(%RBP),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVHPD %XMM2,(%R9,%R10,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 4-12 | 0.50 |
VEXTRACTF128 $0x1,%YMM2,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVLPD %XMM1,(%R9,%R14,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 4-12 | 0.50 |
VMOVHPD %XMM1,(%R9,%R15,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 4-12 | 0.50 |
ADD $0x4,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CMP %RDI,%RBX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JB 41ca50 <_ZNK11qmcplusplus14BsplineFunctorIdE11evaluateVGLEiiiPKdPdS4_S4_S4_Pi+0x4c0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |