Loop Id: 223 | Module: exec | Source: OneBodyJastrowRef.h:134-155 [...] | Coverage: 0.01% |
---|
Loop Id: 223 | Module: exec | Source: OneBodyJastrowRef.h:134-155 [...] | Coverage: 0.01% |
---|
0x415e5d VXORPD %XMM3,%XMM3,%XMM3 |
0x415e61 NOPW %CS:(%RAX,%RAX,1) |
0x415e70 VMOVSD -0x48(%RBP),%XMM0 |
0x415e75 VSUBSD %XMM3,%XMM0,%XMM0 |
0x415e79 CALL 4caa90 <exp> |
0x415e7e MOV -0x40(%RBP),%RCX |
0x415e82 MOV (%RCX),%RAX |
0x415e85 MOV -0x50(%RBP),%RBX |
0x415e89 VMOVSD %XMM0,(%RAX,%RBX,8) |
0x415e8e INC %RBX |
0x415e91 MOV 0x8(%RCX),%RCX |
0x415e95 SUB %RAX,%RCX |
0x415e98 SAR $0x3,%RCX |
0x415e9c CMP %RBX,%RCX |
0x415e9f JBE 4160a4 |
0x415ea5 MOV -0x38(%RBP),%RDI |
0x415ea9 MOVSXD 0x2a0(%RDI),%RAX |
0x415eb0 MOV 0xf0(%R15),%RCX |
0x415eb7 VMOVSD (%RCX,%RAX,8),%XMM0 |
0x415ebc VMOVSD %XMM0,-0x48(%RBP) |
0x415ec1 MOV 0xa8(%R15),%ESI |
0x415ec8 CALL 453d50 <_ZNK11qmcplusplus11ParticleSet14getDistTableABEi> |
0x415ecd MOV 0x48(%RAX),%RAX |
0x415ed1 LEA (%RBX,%RBX,4),%RCX |
0x415ed5 MOV 0x18(%RAX,%RCX,8),%R12 |
0x415eda MOV 0x98(%R15),%EAX |
0x415ee1 TEST %EAX,%EAX |
0x415ee3 MOV %RBX,-0x50(%RBP) |
0x415ee7 JLE 415f70 |
0x415eed VXORPD %XMM3,%XMM3,%XMM3 |
0x415ef1 XOR %EBX,%EBX |
0x415ef3 JMP 415f0f |
(225) 0x415f00 MOVSXD %EAX,%RCX |
(225) 0x415f03 INC %RBX |
(225) 0x415f06 CMP %RCX,%RBX |
(225) 0x415f09 JGE 415e70 |
(225) 0x415f0f MOV 0x1c8(%R15),%RCX |
(225) 0x415f16 MOV (%RCX,%RBX,8),%RDI |
(225) 0x415f1a TEST %RDI,%RDI |
(225) 0x415f1d JE 415f00 |
(225) 0x415f1f MOV 0xa0(%R15),%RAX |
(225) 0x415f26 MOV 0x148(%R15),%R9 |
(225) 0x415f2d MOV 0x268(%RAX),%RAX |
(225) 0x415f34 MOV 0x18(%RAX),%RAX |
(225) 0x415f38 MOV (%RAX,%RBX,4),%EDX |
(225) 0x415f3b MOV 0x4(%RAX,%RBX,4),%ECX |
(225) 0x415f3f MOV $-0x1,%ESI |
(225) 0x415f44 MOV %R12,%R8 |
(225) 0x415f47 VMOVSD %XMM3,-0x30(%RBP) |
(225) 0x415f4c CALL 417f60 <_ZNK11qmcplusplus14BsplineFunctorIdE9evaluateVEiiiPKdPd> |
(225) 0x415f51 VMOVSD -0x30(%RBP),%XMM3 |
(225) 0x415f56 VADDSD %XMM3,%XMM0,%XMM3 |
(225) 0x415f5a MOV 0x98(%R15),%EAX |
(225) 0x415f61 JMP 415f00 |
0x415f70 MOV 0x90(%R15),%ECX |
0x415f77 TEST %ECX,%ECX |
0x415f79 JLE 415e5d |
0x415f7f MOV 0xa0(%R15),%RAX |
0x415f86 MOV 0x1c8(%R15),%RDX |
0x415f8d MOV 0x18(%RAX),%R13 |
0x415f91 VXORPD %XMM3,%XMM3,%XMM3 |
0x415f95 XOR %EBX,%EBX |
0x415f97 MOV %RCX,-0x60(%RBP) |
0x415f9b MOV %R12,-0x58(%RBP) |
0x415f9f JMP 415fc0 |
(224) 0x415fb0 VADDSD %XMM3,%XMM1,%XMM3 |
(224) 0x415fb4 INC %RBX |
(224) 0x415fb7 CMP %RBX,%RCX |
(224) 0x415fba JE 415e70 |
(224) 0x415fc0 MOVSXD (%R13,%RBX,4),%RAX |
(224) 0x415fc5 MOV (%RDX,%RAX,8),%R14 |
(224) 0x415fc9 TEST %R14,%R14 |
(224) 0x415fcc JE 415fb4 |
(224) 0x415fce VMOVSD (%R12,%RBX,8),%XMM0 |
(224) 0x415fd4 VMOVSD 0x8(%R14),%XMM2 |
(224) 0x415fda VXORPD %XMM1,%XMM1,%XMM1 |
(224) 0x415fde VUCOMISD %XMM0,%XMM2 |
(224) 0x415fe2 JBE 415fb0 |
(224) 0x415fe4 VMULSD 0x238(%R14),%XMM0,%XMM0 |
(224) 0x415fed LEA -0x68(%RBP),%RDI |
(224) 0x415ff1 VMOVSD %XMM3,-0x30(%RBP) |
(224) 0x415ff6 MOV %RDX,%R12 |
(224) 0x415ff9 CALL 4cab20 <modf> |
(224) 0x415ffe MOV %R12,%RDX |
(224) 0x416001 MOV -0x58(%RBP),%R12 |
(224) 0x416005 VCVTTSD2SI -0x68(%RBP),%EAX |
(224) 0x41600a CLTQ |
(224) 0x41600c VMOVSD 0x20(%R14),%XMM1 |
(224) 0x416012 VFMADD231SD 0x18(%R14),%XMM0,%XMM1 |
(224) 0x416018 VFMADD213SD 0x28(%R14),%XMM0,%XMM1 |
(224) 0x41601e VFMADD213SD 0x30(%R14),%XMM0,%XMM1 |
(224) 0x416024 MOV 0x218(%R14),%RCX |
(224) 0x41602b VMULSD (%RCX,%RAX,8),%XMM1,%XMM1 |
(224) 0x416030 VMOVSD 0x40(%R14),%XMM2 |
(224) 0x416036 VFMADD231SD 0x38(%R14),%XMM0,%XMM2 |
(224) 0x41603c VFMADD213SD 0x48(%R14),%XMM0,%XMM2 |
(224) 0x416042 VFMADD213SD 0x50(%R14),%XMM0,%XMM2 |
(224) 0x416048 VFMADD132SD 0x8(%RCX,%RAX,8),%XMM1,%XMM2 |
(224) 0x41604f VMOVSD 0x60(%R14),%XMM3 |
(224) 0x416055 VFMADD231SD 0x58(%R14),%XMM0,%XMM3 |
(224) 0x41605b VFMADD213SD 0x68(%R14),%XMM0,%XMM3 |
(224) 0x416061 VFMADD213SD 0x70(%R14),%XMM0,%XMM3 |
(224) 0x416067 VFMADD132SD 0x10(%RCX,%RAX,8),%XMM2,%XMM3 |
(224) 0x41606e VMOVSD 0x80(%R14),%XMM1 |
(224) 0x416077 VFMADD231SD 0x78(%R14),%XMM0,%XMM1 |
(224) 0x41607d VFMADD213SD 0x88(%R14),%XMM0,%XMM1 |
(224) 0x416086 VFMADD213SD 0x90(%R14),%XMM0,%XMM1 |
(224) 0x41608f VFMADD132SD 0x18(%RCX,%RAX,8),%XMM3,%XMM1 |
(224) 0x416096 MOV -0x60(%RBP),%RCX |
(224) 0x41609a VMOVSD -0x30(%RBP),%XMM3 |
(224) 0x41609f JMP 415fb0 |
/scratch_na/users/xoserete/qaas_runs/171-284-5201/intel/miniqmc/build/miniqmc/src/QMCWaveFunctions/Jastrow/OneBodyJastrowRef.h: 134 - 155 |
-------------------------------------------------------------------------------- |
134: for (int k = 0; k < ratios.size(); ++k) |
135: ratios[k] = std::exp(Vat[VP.refPtcl] - computeU(VP.getDistTableAB(myTableID).getDistRow(k).data())); |
[...] |
141: if (NumGroups > 0) |
142: { |
143: for (int jg = 0; jg < NumGroups; ++jg) |
144: { |
145: if (F[jg] != nullptr) |
146: curVat += F[jg]->evaluateV(-1, Ions.first(jg), Ions.last(jg), dist, DistCompressed.data()); |
147: } |
148: } |
149: else |
150: { |
151: for (int c = 0; c < Nions; ++c) |
152: { |
153: int gid = Ions.GroupID[c]; |
154: if (F[gid] != nullptr) |
155: curVat += F[gid]->evaluate(dist[c]); |
/usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8/bits/shared_ptr_base.h: 1308 - 1308 |
-------------------------------------------------------------------------------- |
1308: { return _M_ptr; } |
/scratch_na/users/xoserete/qaas_runs/171-284-5201/intel/miniqmc/build/miniqmc/src/Numerics/OhmmsPETE/OhmmsVector.h: 223 - 249 |
-------------------------------------------------------------------------------- |
223: return X[i]; |
[...] |
229: return X[i]; |
[...] |
249: inline const_pointer data() const { return X; } |
/scratch_na/users/xoserete/qaas_runs/171-284-5201/intel/miniqmc/build/miniqmc/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h: 166 - 181 |
-------------------------------------------------------------------------------- |
166: if (r >= cutoff_radius) |
167: return 0.0; |
168: r *= DeltaRInv; |
169: real_type ipart, t; |
170: t = std::modf(r, &ipart); |
171: int i = (int)ipart; |
[...] |
179: (SplineCoefs[i+0]*(A[ 0]*tp[0] + A[ 1]*tp[1] + A[ 2]*tp[2] + A[ 3]*tp[3])+ |
180: SplineCoefs[i+1]*(A[ 4]*tp[0] + A[ 5]*tp[1] + A[ 6]*tp[2] + A[ 7]*tp[3])+ |
181: SplineCoefs[i+2]*(A[ 8]*tp[0] + A[ 9]*tp[1] + A[10]*tp[2] + A[11]*tp[3])+ |
/scratch_na/users/xoserete/qaas_runs/171-284-5201/intel/miniqmc/build/miniqmc/src/Particle/ParticleSet.h: 313 - 316 |
-------------------------------------------------------------------------------- |
313: inline int first(int igroup) const { return (*group_offsets_)[igroup]; } |
314: |
315: ///return the last index of a group i |
316: inline int last(int igroup) const { return (*group_offsets_)[igroup + 1]; } |
/usr/lib/gcc/x86_64-redhat-linux/8/../../../../include/c++/8/bits/stl_vector.h: 806 - 1056 |
-------------------------------------------------------------------------------- |
806: { return size_type(this->_M_impl._M_finish - this->_M_impl._M_start); } |
[...] |
933: return *(this->_M_impl._M_start + __n); |
[...] |
951: return *(this->_M_impl._M_start + __n); |
[...] |
1056: { return _M_data_ptr(this->_M_impl._M_start); } |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 3.21 |
CQA speedup if FP arith vectorized | 2.57 |
CQA speedup if fully vectorized | 12.51 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.32 |
Bottlenecks | micro-operation queue, |
Function | miniqmcreference::OneBodyJastrowRef |
Source | OneBodyJastrowRef.h:134-135,OneBodyJastrowRef.h:141-141,OneBodyJastrowRef.h:151-153,OhmmsVector.h:223-223,OhmmsVector.h:229-229,OhmmsVector.h:249-249,stl_vector.h:806-806,stl_vector.h:933-933,stl_vector.h:951-951 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 7.50 |
CQA cycles if no scalar integer | 2.33 |
CQA cycles if FP arith vectorized | 2.92 |
CQA cycles if fully vectorized | 0.60 |
Front-end cycles | 7.50 |
DIV/SQRT cycles | 2.00 |
P0 cycles | 1.70 |
P1 cycles | 5.67 |
P2 cycles | 5.67 |
P3 cycles | 3.50 |
P4 cycles | 1.70 |
P5 cycles | 2.00 |
P6 cycles | 3.50 |
P7 cycles | 3.50 |
P8 cycles | 3.50 |
P9 cycles | 1.60 |
P10 cycles | 5.67 |
P11 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 7.40 |
Stall cycles (UFS) | 0.00 |
Nb insns | 43.00 |
Nb uops | 45.00 |
Nb loads | 17.00 |
Nb stores | 5.00 |
Nb stack references | 6.00 |
FLOP/cycle | 0.13 |
Nb FLOP add-sub | 1.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 21.33 |
Bytes prefetched | 0.00 |
Bytes loaded | 120.00 |
Bytes stored | 40.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 17.65 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 60.00 |
Vector-efficiency ratio all | 13.97 |
Vector-efficiency ratio load | 11.46 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 18.75 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 3.21 |
CQA speedup if FP arith vectorized | 2.57 |
CQA speedup if fully vectorized | 12.51 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.32 |
Bottlenecks | micro-operation queue, |
Function | miniqmcreference::OneBodyJastrowRef |
Source | OneBodyJastrowRef.h:134-135,OneBodyJastrowRef.h:141-141,OneBodyJastrowRef.h:151-153,OhmmsVector.h:223-223,OhmmsVector.h:229-229,OhmmsVector.h:249-249,stl_vector.h:806-806,stl_vector.h:933-933,stl_vector.h:951-951 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 7.50 |
CQA cycles if no scalar integer | 2.33 |
CQA cycles if FP arith vectorized | 2.92 |
CQA cycles if fully vectorized | 0.60 |
Front-end cycles | 7.50 |
DIV/SQRT cycles | 2.00 |
P0 cycles | 1.70 |
P1 cycles | 5.67 |
P2 cycles | 5.67 |
P3 cycles | 3.50 |
P4 cycles | 1.70 |
P5 cycles | 2.00 |
P6 cycles | 3.50 |
P7 cycles | 3.50 |
P8 cycles | 3.50 |
P9 cycles | 1.60 |
P10 cycles | 5.67 |
P11 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 7.40 |
Stall cycles (UFS) | 0.00 |
Nb insns | 43.00 |
Nb uops | 45.00 |
Nb loads | 17.00 |
Nb stores | 5.00 |
Nb stack references | 6.00 |
FLOP/cycle | 0.13 |
Nb FLOP add-sub | 1.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 21.33 |
Bytes prefetched | 0.00 |
Bytes loaded | 120.00 |
Bytes stored | 40.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 17.65 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 60.00 |
Vector-efficiency ratio all | 13.97 |
Vector-efficiency ratio load | 11.46 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 18.75 |
Path / |
Function | miniqmcreference::OneBodyJastrowRef |
Source file and lines | OneBodyJastrowRef.h:134-155 |
Module | exec |
nb instructions | 43 |
nb uops | 45 |
loop length | 201 |
used x86 registers | 10 |
used mmx registers | 0 |
used xmm registers | 2 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 6 |
micro-operation queue | 7.50 cycles |
front end | 7.50 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 2.00 | 1.70 | 5.67 | 5.67 | 3.50 | 1.70 | 2.00 | 3.50 | 3.50 | 3.50 | 1.60 | 5.67 |
cycles | 2.00 | 1.70 | 5.67 | 5.67 | 3.50 | 1.70 | 2.00 | 3.50 | 3.50 | 3.50 | 1.60 | 5.67 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 7.40 |
Stall cycles | 0.00 |
LM full (events) | 0.01 |
Front-end | 7.50 |
Dispatch | 5.67 |
Overall L1 | 7.50 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 37% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 100% |
all | 17% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 60% |
all | 11% |
load | 10% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 9% |
all | 17% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 25% |
all | 13% |
load | 11% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 18% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VXORPD %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMOVSD -0x48(%RBP),%XMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VSUBSD %XMM3,%XMM0,%XMM0 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
CALL 4caa90 <exp> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV -0x40(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x50(%RBP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD %XMM0,(%RAX,%RBX,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
INC %RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0x8(%RCX),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %RAX,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SAR $0x3,%RCX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
CMP %RBX,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JBE 4160a4 <_ZN16miniqmcreference17OneBodyJastrowRefIN11qmcplusplus14BsplineFunctorIdEEE14evaluateRatiosERNS1_18VirtualParticleSetERSt6vectorIdSaIdEE+0x274> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV -0x38(%RBP),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVSXD 0x2a0(%RDI),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0xf0(%R15),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD (%RCX,%RAX,8),%XMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD %XMM0,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0xa8(%R15),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
CALL 453d50 <_ZNK11qmcplusplus11ParticleSet14getDistTableABEi> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV 0x48(%RAX),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%RBX,%RBX,4),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV 0x18(%RAX,%RCX,8),%R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x98(%R15),%EAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
TEST %EAX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
MOV %RBX,-0x50(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JLE 415f70 <_ZN16miniqmcreference17OneBodyJastrowRefIN11qmcplusplus14BsplineFunctorIdEEE14evaluateRatiosERNS1_18VirtualParticleSetERSt6vectorIdSaIdEE+0x140> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
VXORPD %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
XOR %EBX,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 415f0f <_ZN16miniqmcreference17OneBodyJastrowRefIN11qmcplusplus14BsplineFunctorIdEEE14evaluateRatiosERNS1_18VirtualParticleSetERSt6vectorIdSaIdEE+0xdf> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
MOV 0x90(%R15),%ECX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
TEST %ECX,%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
JLE 415e5d <_ZN16miniqmcreference17OneBodyJastrowRefIN11qmcplusplus14BsplineFunctorIdEEE14evaluateRatiosERNS1_18VirtualParticleSetERSt6vectorIdSaIdEE+0x2d> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0xa0(%R15),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x1c8(%R15),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x18(%RAX),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VXORPD %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
XOR %EBX,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %RCX,-0x60(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R12,-0x58(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JMP 415fc0 <_ZN16miniqmcreference17OneBodyJastrowRefIN11qmcplusplus14BsplineFunctorIdEEE14evaluateRatiosERNS1_18VirtualParticleSetERSt6vectorIdSaIdEE+0x190> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
Function | miniqmcreference::OneBodyJastrowRef |
Source file and lines | OneBodyJastrowRef.h:134-155 |
Module | exec |
nb instructions | 43 |
nb uops | 45 |
loop length | 201 |
used x86 registers | 10 |
used mmx registers | 0 |
used xmm registers | 2 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 6 |
micro-operation queue | 7.50 cycles |
front end | 7.50 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 2.00 | 1.70 | 5.67 | 5.67 | 3.50 | 1.70 | 2.00 | 3.50 | 3.50 | 3.50 | 1.60 | 5.67 |
cycles | 2.00 | 1.70 | 5.67 | 5.67 | 3.50 | 1.70 | 2.00 | 3.50 | 3.50 | 3.50 | 1.60 | 5.67 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 7.40 |
Stall cycles | 0.00 |
LM full (events) | 0.01 |
Front-end | 7.50 |
Dispatch | 5.67 |
Overall L1 | 7.50 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 37% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 100% |
all | 17% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 60% |
all | 11% |
load | 10% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 9% |
all | 17% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 25% |
all | 13% |
load | 11% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 18% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VXORPD %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMOVSD -0x48(%RBP),%XMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VSUBSD %XMM3,%XMM0,%XMM0 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
CALL 4caa90 <exp> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV -0x40(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV (%RCX),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV -0x50(%RBP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD %XMM0,(%RAX,%RBX,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
INC %RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV 0x8(%RCX),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %RAX,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SAR $0x3,%RCX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
CMP %RBX,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JBE 4160a4 <_ZN16miniqmcreference17OneBodyJastrowRefIN11qmcplusplus14BsplineFunctorIdEEE14evaluateRatiosERNS1_18VirtualParticleSetERSt6vectorIdSaIdEE+0x274> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV -0x38(%RBP),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVSXD 0x2a0(%RDI),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0xf0(%R15),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD (%RCX,%RAX,8),%XMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD %XMM0,-0x48(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0xa8(%R15),%ESI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
CALL 453d50 <_ZNK11qmcplusplus11ParticleSet14getDistTableABEi> | 2 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 1 |
MOV 0x48(%RAX),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%RBX,%RBX,4),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV 0x18(%RAX,%RCX,8),%R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x98(%R15),%EAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
TEST %EAX,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
MOV %RBX,-0x50(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JLE 415f70 <_ZN16miniqmcreference17OneBodyJastrowRefIN11qmcplusplus14BsplineFunctorIdEEE14evaluateRatiosERNS1_18VirtualParticleSetERSt6vectorIdSaIdEE+0x140> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
VXORPD %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
XOR %EBX,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 415f0f <_ZN16miniqmcreference17OneBodyJastrowRefIN11qmcplusplus14BsplineFunctorIdEEE14evaluateRatiosERNS1_18VirtualParticleSetERSt6vectorIdSaIdEE+0xdf> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
MOV 0x90(%R15),%ECX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
TEST %ECX,%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
JLE 415e5d <_ZN16miniqmcreference17OneBodyJastrowRefIN11qmcplusplus14BsplineFunctorIdEEE14evaluateRatiosERNS1_18VirtualParticleSetERSt6vectorIdSaIdEE+0x2d> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0xa0(%R15),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x1c8(%R15),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x18(%RAX),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VXORPD %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
XOR %EBX,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %RCX,-0x60(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R12,-0x58(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JMP 415fc0 <_ZN16miniqmcreference17OneBodyJastrowRefIN11qmcplusplus14BsplineFunctorIdEEE14evaluateRatiosERNS1_18VirtualParticleSetERSt6vectorIdSaIdEE+0x190> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |