Function: _ZN11qmcplusplus6SPOSet17evaluateDetRatiosERKNS_18VirtualParticleSetERNS_6VectorIdSaIdEEER ... | Module: libqmcwfs.so | Source: SPOSet.h:77-88 [...] | Coverage: 0.83% |
---|
Function: _ZN11qmcplusplus6SPOSet17evaluateDetRatiosERKNS_18VirtualParticleSetERNS_6VectorIdSaIdEEER ... | Module: libqmcwfs.so | Source: SPOSet.h:77-88 [...] | Coverage: 0.83% |
---|
/home/eoseret/qaas_runs_CPU_9468/171-145-9236/intel/miniqmc/build/miniqmc/src/Platforms/CPU/SIMD/inner_product.hpp: 82 - 83 |
-------------------------------------------------------------------------------- |
82: for (int i = 0; i < n; i++) |
83: res += a[i] * b[i]; |
/home/eoseret/qaas_runs_CPU_9468/171-145-9236/intel/miniqmc/build/miniqmc/src/QMCWaveFunctions/SPOSet.h: 77 - 88 |
-------------------------------------------------------------------------------- |
77: virtual void evaluateDetRatios(const VirtualParticleSet& VP, |
[...] |
83: for (int iat = 0; iat < VP.getTotalNum(); ++iat) |
84: { |
85: evaluate(VP, iat, psi); |
86: ratios[iat] = simd::dot(psi.data(), psiinv.data(), psi.size()); |
87: } |
88: } |
/home/eoseret/qaas_runs_CPU_9468/171-145-9236/intel/miniqmc/build/miniqmc/src/Numerics/OhmmsPETE/OhmmsVector.h: 178 - 249 |
-------------------------------------------------------------------------------- |
178: inline size_t size() const { return nLocal; } |
[...] |
248: inline pointer data() { return X; } |
249: inline const_pointer data() const { return X; } |
0xaf90 PUSH %RBP |
0xaf91 MOV %RSP,%RBP |
0xaf94 PUSH %R15 |
0xaf96 PUSH %R14 |
0xaf98 PUSH %R13 |
0xaf9a PUSH %R12 |
0xaf9c PUSH %RBX |
0xaf9d SUB $0x18,%RSP |
0xafa1 CMPQ $0,0x260(%RSI) |
0xafa9 MOV %RCX,-0x38(%RBP) |
0xafad JE b133 |
0xafb3 MOV %RDI,%R12 |
0xafb6 MOV %RSI,%RBX |
0xafb9 MOV %RDX,%R15 |
0xafbc MOV %R8,%R13 |
0xafbf XOR %R14D,%R14D |
0xafc2 NOPW (%RAX,%RAX,1) |
(43) 0xafc8 MOV (%R12),%RAX |
(43) 0xafcc MOV %R15,%RCX |
(43) 0xafcf MOV %R14D,%EDX |
(43) 0xafd2 MOV %R12,%RDI |
(43) 0xafd5 MOV %RBX,%RSI |
(43) 0xafd8 CALLQ 0x10(%RAX) |
(43) 0xafdb MOV -0x38(%RBP),%RDX |
(43) 0xafdf MOV 0x8(%R15),%R8 |
(43) 0xafe3 MOV 0x18(%R15),%RCX |
(43) 0xafe7 MOV 0x18(%RDX),%RDI |
(43) 0xafeb TEST %R8D,%R8D |
(43) 0xafee JLE b148 |
(43) 0xaff4 XOR %R9D,%R9D |
(43) 0xaff7 VXORPD %XMM0,%XMM0,%XMM0 |
(43) 0xaffb LEA -0x1(%R8),%ESI |
(43) 0xafff AND $0x7,%R8D |
(43) 0xb003 JE b09e |
(43) 0xb009 CMP $0x1,%R8 |
(43) 0xb00d JE b087 |
(43) 0xb00f CMP $0x2,%R8 |
(43) 0xb013 JE b078 |
(43) 0xb015 CMP $0x3,%R8 |
(43) 0xb019 JE b069 |
(43) 0xb01b CMP $0x4,%R8 |
(43) 0xb01f JE b05a |
(43) 0xb021 CMP $0x5,%R8 |
(43) 0xb025 JE b04b |
(43) 0xb027 CMP $0x6,%R8 |
(43) 0xb02b JE b03c |
(43) 0xb02d VMOVSD (%RCX),%XMM7 |
(43) 0xb031 VFMADD231SD (%RDI),%XMM7,%XMM0 |
(43) 0xb036 MOV $0x1,%R9D |
(43) 0xb03c VMOVSD (%RCX,%R9,8),%XMM1 |
(43) 0xb042 VFMADD231SD (%RDI,%R9,8),%XMM1,%XMM0 |
(43) 0xb048 INC %R9 |
(43) 0xb04b VMOVSD (%RCX,%R9,8),%XMM6 |
(43) 0xb051 VFMADD231SD (%RDI,%R9,8),%XMM6,%XMM0 |
(43) 0xb057 INC %R9 |
(43) 0xb05a VMOVSD (%RCX,%R9,8),%XMM5 |
(43) 0xb060 VFMADD231SD (%RDI,%R9,8),%XMM5,%XMM0 |
(43) 0xb066 INC %R9 |
(43) 0xb069 VMOVSD (%RCX,%R9,8),%XMM4 |
(43) 0xb06f VFMADD231SD (%RDI,%R9,8),%XMM4,%XMM0 |
(43) 0xb075 INC %R9 |
(43) 0xb078 VMOVSD (%RCX,%R9,8),%XMM3 |
(43) 0xb07e VFMADD231SD (%RDI,%R9,8),%XMM3,%XMM0 |
(43) 0xb084 INC %R9 |
(43) 0xb087 MOV %R9,%R10 |
(43) 0xb08a VMOVSD (%RCX,%R9,8),%XMM2 |
(43) 0xb090 VFMADD231SD (%RDI,%R9,8),%XMM2,%XMM0 |
(43) 0xb096 INC %R9 |
(43) 0xb099 CMP %R10,%RSI |
(43) 0xb09c JE b119 |
(44) 0xb09e VMOVSD (%RCX,%R9,8),%XMM8 |
(44) 0xb0a4 VFMADD231SD (%RDI,%R9,8),%XMM8,%XMM0 |
(44) 0xb0aa LEA 0x7(%R9),%R11 |
(44) 0xb0ae VMOVSD 0x8(%RCX,%R9,8),%XMM9 |
(44) 0xb0b5 VFMADD231SD 0x8(%RDI,%R9,8),%XMM9,%XMM0 |
(44) 0xb0bc VMOVSD 0x10(%RCX,%R9,8),%XMM10 |
(44) 0xb0c3 VFMADD231SD 0x10(%RDI,%R9,8),%XMM10,%XMM0 |
(44) 0xb0ca VMOVSD 0x18(%RCX,%R9,8),%XMM11 |
(44) 0xb0d1 VFMADD231SD 0x18(%RDI,%R9,8),%XMM11,%XMM0 |
(44) 0xb0d8 VMOVSD 0x20(%RCX,%R9,8),%XMM12 |
(44) 0xb0df VFMADD231SD 0x20(%RDI,%R9,8),%XMM12,%XMM0 |
(44) 0xb0e6 VMOVSD 0x28(%RCX,%R9,8),%XMM13 |
(44) 0xb0ed VFMADD231SD 0x28(%RDI,%R9,8),%XMM13,%XMM0 |
(44) 0xb0f4 VMOVSD 0x30(%RCX,%R9,8),%XMM14 |
(44) 0xb0fb VFMADD231SD 0x30(%RDI,%R9,8),%XMM14,%XMM0 |
(44) 0xb102 VMOVSD 0x38(%RCX,%R9,8),%XMM15 |
(44) 0xb109 VFMADD231SD 0x38(%RDI,%R9,8),%XMM15,%XMM0 |
(44) 0xb110 ADD $0x8,%R9 |
(44) 0xb114 CMP %R11,%RSI |
(44) 0xb117 JNE b09e |
(43) 0xb119 MOV (%R13),%RAX |
(43) 0xb11d VMOVSD %XMM0,(%RAX,%R14,8) |
(43) 0xb123 INC %R14 |
(43) 0xb126 CMP 0x260(%RBX),%R14 |
(43) 0xb12d JB afc8 |
0xb133 ADD $0x18,%RSP |
0xb137 POP %RBX |
0xb138 POP %R12 |
0xb13a POP %R13 |
0xb13c POP %R14 |
0xb13e POP %R15 |
0xb140 POP %RBP |
0xb141 RET |
0xb142 NOPW (%RAX,%RAX,1) |
(43) 0xb148 VXORPD %XMM0,%XMM0,%XMM0 |
(43) 0xb14c JMP b119 |
0xb14e XCHG %AX,%AX |
Path / |
Source file and lines | SPOSet.h:77-88 |
Module | libqmcwfs.so |
nb instructions | 27 |
nb uops | 27 |
loop length | 79 |
used x86 registers | 13 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 1 |
micro-operation queue | 4.50 cycles |
front end | 4.50 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.00 | 0.40 | 2.67 | 2.67 | 3.50 | 0.40 | 1.00 | 3.50 | 3.50 | 3.50 | 0.20 | 2.67 |
cycles | 1.00 | 0.40 | 2.67 | 2.67 | 3.50 | 0.40 | 1.00 | 3.50 | 3.50 | 3.50 | 0.20 | 2.67 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 4.60-4.61 |
Stall cycles | 0.00 |
Front-end | 4.50 |
Dispatch | 3.50 |
Overall L1 | 4.50 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 11% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 11% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R15 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
SUB $0x18,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CMPQ $0,0x260(%RSI) | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
MOV %RCX,-0x38(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JE b133 <_ZN11qmcplusplus6SPOSet17evaluateDetRatiosERKNS_18VirtualParticleSetERNS_6VectorIdSaIdEEERKS6_RSt6vectorIdS5_E+0x1a3> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV %RDI,%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %RSI,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %RDX,%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %R8,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
XOR %R14D,%R14D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD $0x18,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
RET | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0.33 | 0 | 2.13 |
NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
XCHG %AX,%AX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Source file and lines | SPOSet.h:77-88 |
Module | libqmcwfs.so |
nb instructions | 27 |
nb uops | 27 |
loop length | 79 |
used x86 registers | 13 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 1 |
micro-operation queue | 4.50 cycles |
front end | 4.50 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.00 | 0.40 | 2.67 | 2.67 | 3.50 | 0.40 | 1.00 | 3.50 | 3.50 | 3.50 | 0.20 | 2.67 |
cycles | 1.00 | 0.40 | 2.67 | 2.67 | 3.50 | 0.40 | 1.00 | 3.50 | 3.50 | 3.50 | 0.20 | 2.67 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 4.60-4.61 |
Stall cycles | 0.00 |
Front-end | 4.50 |
Dispatch | 3.50 |
Overall L1 | 4.50 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 11% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 11% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
PUSH %R15 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R14 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R13 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %R12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
PUSH %RBX | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 5-12 | 0.50 |
SUB $0x18,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CMPQ $0,0x260(%RSI) | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
MOV %RCX,-0x38(%RBP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JE b133 <_ZN11qmcplusplus6SPOSet17evaluateDetRatiosERKNS_18VirtualParticleSetERNS_6VectorIdSaIdEEERKS6_RSt6vectorIdS5_E+0x1a3> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV %RDI,%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %RSI,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %RDX,%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
MOV %R8,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
XOR %R14D,%R14D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD $0x18,%RSP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
POP %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
POP %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1-6 | 0.33 |
RET | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0.33 | 0 | 2.13 |
NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
XCHG %AX,%AX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
Name | Coverage (%) | Time (s) |
---|---|---|
▼_ZN11qmcplusplus6SPOSet17evaluateDetRatiosERKNS_18VirtualParticleSetERNS_6VectorIdSaIdEEERKS6_RSt6vectorIdS5_E– | 0.83 | 0.92 |
▼Loop 43 - SPOSet.h:83-86 - libqmcwfs.so– | 0 | 0 |
○Loop 44 - inner_product.hpp:82-83 - libqmcwfs.so | 0.83 | 0.88 |