Function: _ZN11qmcplusplus14OneBodyJastrowINS_14BsplineFunctorIdEEE8evalGradERNS_11ParticleSetEi | Module: libqmcwfs.so | Source: OneBodyJastrow.h:247-247 | Coverage: 0.01% |
---|
Function: _ZN11qmcplusplus14OneBodyJastrowINS_14BsplineFunctorIdEEE8evalGradERNS_11ParticleSetEi | Module: libqmcwfs.so | Source: OneBodyJastrow.h:247-247 | Coverage: 0.01% |
---|
/home/eoseret/qaas_runs_CPU_9468/171-145-9236/intel/miniqmc/build/miniqmc/src/QMCWaveFunctions/Jastrow/OneBodyJastrow.h: 247 - 247 |
-------------------------------------------------------------------------------- |
247: GradType evalGrad(ParticleSet& P, int iat) { return GradType(Grad[iat]); } |
0x33e80 MOV 0x190(%RSI),%RSI |
0x33e87 MOVSXD %ECX,%RCX |
0x33e8a MOV %RDI,%RAX |
0x33e8d LEA (%RCX,%RCX,2),%RDX |
0x33e91 LEA (%RSI,%RDX,8),%RDI |
0x33e95 VMOVDQU (%RDI),%XMM0 |
0x33e99 MOV 0x10(%RDI),%R8 |
0x33e9d VMOVDQU %XMM0,(%RAX) |
0x33ea1 MOV %R8,0x10(%RAX) |
0x33ea5 RET |
0x33ea6 NOPW %CS:(%RAX,%RAX,1) |
Path / |
Source file and lines | OneBodyJastrow.h:247-247 |
Module | libqmcwfs.so |
nb instructions | 10 |
nb uops | 10 |
loop length | 38 |
used x86 registers | 6 |
used mmx registers | 0 |
used xmm registers | 1 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
micro-operation queue | 1.67 cycles |
front end | 1.67 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 0.50 | 0.33 | 1.33 | 1.33 | 1.00 | 0.33 | 0.50 | 1.00 | 1.00 | 1.00 | 0.33 | 1.33 |
cycles | 0.50 | 0.33 | 1.33 | 1.33 | 1.00 | 0.33 | 0.50 | 1.00 | 1.00 | 1.00 | 0.33 | 1.33 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 5.05 |
Stall cycles | 2.94 |
LM full (events) | 4.41 |
Front-end | 1.67 |
Dispatch | 1.33 |
Overall L1 | 1.67 |
all | 28% |
load | 33% |
store | 50% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 16% |
load | 16% |
store | 18% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV 0x190(%RSI),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVSXD %ECX,%RCX | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
MOV %RDI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA (%RCX,%RCX,2),%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA (%RSI,%RDX,8),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMOVDQU (%RDI),%XMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
MOV 0x10(%RDI),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVDQU %XMM0,(%RAX) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 0.50 |
MOV %R8,0x10(%RAX) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
RET | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0.33 | 0 | 2.13 |
Source file and lines | OneBodyJastrow.h:247-247 |
Module | libqmcwfs.so |
nb instructions | 10 |
nb uops | 10 |
loop length | 38 |
used x86 registers | 6 |
used mmx registers | 0 |
used xmm registers | 1 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
micro-operation queue | 1.67 cycles |
front end | 1.67 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 0.50 | 0.33 | 1.33 | 1.33 | 1.00 | 0.33 | 0.50 | 1.00 | 1.00 | 1.00 | 0.33 | 1.33 |
cycles | 0.50 | 0.33 | 1.33 | 1.33 | 1.00 | 0.33 | 0.50 | 1.00 | 1.00 | 1.00 | 0.33 | 1.33 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 5.05 |
Stall cycles | 2.94 |
LM full (events) | 4.41 |
Front-end | 1.67 |
Dispatch | 1.33 |
Overall L1 | 1.67 |
all | 28% |
load | 33% |
store | 50% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 16% |
load | 16% |
store | 18% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV 0x190(%RSI),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVSXD %ECX,%RCX | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
MOV %RDI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
LEA (%RCX,%RCX,2),%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA (%RSI,%RDX,8),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMOVDQU (%RDI),%XMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
MOV 0x10(%RDI),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVDQU %XMM0,(%RAX) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 0.50 |
MOV %R8,0x10(%RAX) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
RET | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0.33 | 0 | 2.13 |
Name | Coverage (%) | Time (s) |
---|---|---|
○_ZN11qmcplusplus14OneBodyJastrowINS_14BsplineFunctorIdEEE8evalGradERNS_11ParticleSetEi | 0.01 | 0.01 |