Loop Id: 1085 | Module: libkripke.so | Source: forall.hpp:59-59 [...] | Coverage: 0.05% |
---|
Loop Id: 1085 | Module: libkripke.so | Source: forall.hpp:59-59 [...] | Coverage: 0.05% |
---|
0x6cb40 VMOVQ %XMM10,%R8 |
0x6cb45 MOV -0x50(%RBP),%RBX |
0x6cb49 MOV -0x58(%RBP),%RDX |
0x6cb4d VMOVQ %XMM8,%R10 |
0x6cb52 MOV (%R8),%R12 |
0x6cb55 ADD %RDI,%R10 |
0x6cb58 XOR %R8D,%R8D |
0x6cb5b MOV %RDI,-0x38(%RBP) |
0x6cb5f MOV %R9,-0x40(%RBP) |
0x6cb63 MOV %RAX,%RDI |
0x6cb66 MOV %R8,%RAX |
0x6cb69 IMUL %RBX,%R12 |
0x6cb6d LEA (%RDX,%R9,8),%RBX |
0x6cb71 VMOVQ %XMM11,%R9 |
0x6cb76 ADD %R10,%R12 |
0x6cb79 NOPL (%RAX) |
(1084) 0x6cb80 MOV (%R11,%RAX,8),%R10 |
(1084) 0x6cb84 MOVSXD (%R9,%RAX,4),%R8 |
(1084) 0x6cb88 ADD %R10,%R8 |
(1084) 0x6cb8b CMP %R8,%R10 |
(1084) 0x6cb8e JGE 6ce10 |
(1084) 0x6cb94 SAL $0x3,%R8 |
(1084) 0x6cb98 VXORPD %XMM0,%XMM0,%XMM0 |
(1084) 0x6cb9c SAL $0x3,%R10 |
(1084) 0x6cba0 MOV %R8,%RDX |
(1084) 0x6cba3 SUB %R10,%RDX |
(1084) 0x6cba6 SUB $0x8,%RDX |
(1084) 0x6cbaa SHR $0x3,%RDX |
(1084) 0x6cbae INC %RDX |
(1084) 0x6cbb1 AND $0x7,%EDX |
(1084) 0x6cbb4 JE 6ccb7 |
(1084) 0x6cbba CMP $0x1,%RDX |
(1084) 0x6cbbe JE 6cc92 |
(1084) 0x6cbc4 CMP $0x2,%RDX |
(1084) 0x6cbc8 JE 6cc76 |
(1084) 0x6cbce CMP $0x3,%RDX |
(1084) 0x6cbd2 JE 6cc5a |
(1084) 0x6cbd8 CMP $0x4,%RDX |
(1084) 0x6cbdc JE 6cc3e |
(1084) 0x6cbde CMP $0x5,%RDX |
(1084) 0x6cbe2 JE 6cc22 |
(1084) 0x6cbe4 CMP $0x6,%RDX |
(1084) 0x6cbe8 JE 6cc06 |
(1084) 0x6cbea MOV (%R13,%R10,1),%RDX |
(1084) 0x6cbef VMOVSD (%R14,%R10,1),%XMM5 |
(1084) 0x6cbf5 ADD $0x8,%R10 |
(1084) 0x6cbf9 IMUL %RCX,%RDX |
(1084) 0x6cbfd ADD %R12,%RDX |
(1084) 0x6cc00 VFMADD231SD (%R15,%RDX,8),%XMM5,%XMM0 |
(1084) 0x6cc06 MOV (%R13,%R10,1),%RDX |
(1084) 0x6cc0b VMOVSD (%R14,%R10,1),%XMM1 |
(1084) 0x6cc11 ADD $0x8,%R10 |
(1084) 0x6cc15 IMUL %RCX,%RDX |
(1084) 0x6cc19 ADD %R12,%RDX |
(1084) 0x6cc1c VFMADD231SD (%R15,%RDX,8),%XMM1,%XMM0 |
(1084) 0x6cc22 MOV (%R13,%R10,1),%RDX |
(1084) 0x6cc27 VMOVSD (%R14,%R10,1),%XMM3 |
(1084) 0x6cc2d ADD $0x8,%R10 |
(1084) 0x6cc31 IMUL %RCX,%RDX |
(1084) 0x6cc35 ADD %R12,%RDX |
(1084) 0x6cc38 VFMADD231SD (%R15,%RDX,8),%XMM3,%XMM0 |
(1084) 0x6cc3e MOV (%R13,%R10,1),%RDX |
(1084) 0x6cc43 VMOVSD (%R14,%R10,1),%XMM7 |
(1084) 0x6cc49 ADD $0x8,%R10 |
(1084) 0x6cc4d IMUL %RCX,%RDX |
(1084) 0x6cc51 ADD %R12,%RDX |
(1084) 0x6cc54 VFMADD231SD (%R15,%RDX,8),%XMM7,%XMM0 |
(1084) 0x6cc5a MOV (%R13,%R10,1),%RDX |
(1084) 0x6cc5f VMOVSD (%R14,%R10,1),%XMM9 |
(1084) 0x6cc65 ADD $0x8,%R10 |
(1084) 0x6cc69 IMUL %RCX,%RDX |
(1084) 0x6cc6d ADD %R12,%RDX |
(1084) 0x6cc70 VFMADD231SD (%R15,%RDX,8),%XMM9,%XMM0 |
(1084) 0x6cc76 MOV (%R13,%R10,1),%RDX |
(1084) 0x6cc7b VMOVSD (%R14,%R10,1),%XMM5 |
(1084) 0x6cc81 ADD $0x8,%R10 |
(1084) 0x6cc85 IMUL %RCX,%RDX |
(1084) 0x6cc89 ADD %R12,%RDX |
(1084) 0x6cc8c VFMADD231SD (%R15,%RDX,8),%XMM5,%XMM0 |
(1084) 0x6cc92 MOV (%R13,%R10,1),%RDX |
(1084) 0x6cc97 VMOVSD (%R14,%R10,1),%XMM1 |
(1084) 0x6cc9d ADD $0x8,%R10 |
(1084) 0x6cca1 IMUL %RCX,%RDX |
(1084) 0x6cca5 ADD %R12,%RDX |
(1084) 0x6cca8 VFMADD231SD (%R15,%RDX,8),%XMM1,%XMM0 |
(1084) 0x6ccae CMP %R8,%R10 |
(1084) 0x6ccb1 JE 6cd8b |
(1086) 0x6ccb7 MOV (%R13,%R10,1),%RDX |
(1086) 0x6ccbc VMOVSD (%R14,%R10,1),%XMM3 |
(1086) 0x6ccc2 VMOVSD 0x8(%R14,%R10,1),%XMM7 |
(1086) 0x6ccc9 VMOVSD 0x10(%R14,%R10,1),%XMM9 |
(1086) 0x6ccd0 VMOVSD 0x18(%R14,%R10,1),%XMM5 |
(1086) 0x6ccd7 VMOVSD 0x20(%R14,%R10,1),%XMM1 |
(1086) 0x6ccde IMUL %RCX,%RDX |
(1086) 0x6cce2 ADD %R12,%RDX |
(1086) 0x6cce5 VFMADD231SD (%R15,%RDX,8),%XMM3,%XMM0 |
(1086) 0x6cceb MOV 0x8(%R13,%R10,1),%RDX |
(1086) 0x6ccf0 VMOVSD 0x28(%R14,%R10,1),%XMM3 |
(1086) 0x6ccf7 IMUL %RCX,%RDX |
(1086) 0x6ccfb ADD %R12,%RDX |
(1086) 0x6ccfe VFMADD231SD (%R15,%RDX,8),%XMM7,%XMM0 |
(1086) 0x6cd04 MOV 0x10(%R13,%R10,1),%RDX |
(1086) 0x6cd09 VMOVSD 0x30(%R14,%R10,1),%XMM7 |
(1086) 0x6cd10 IMUL %RCX,%RDX |
(1086) 0x6cd14 ADD %R12,%RDX |
(1086) 0x6cd17 VFMADD231SD (%R15,%RDX,8),%XMM9,%XMM0 |
(1086) 0x6cd1d MOV 0x18(%R13,%R10,1),%RDX |
(1086) 0x6cd22 VMOVSD 0x38(%R14,%R10,1),%XMM9 |
(1086) 0x6cd29 IMUL %RCX,%RDX |
(1086) 0x6cd2d ADD %R12,%RDX |
(1086) 0x6cd30 VFMADD231SD (%R15,%RDX,8),%XMM5,%XMM0 |
(1086) 0x6cd36 MOV 0x20(%R13,%R10,1),%RDX |
(1086) 0x6cd3b IMUL %RCX,%RDX |
(1086) 0x6cd3f ADD %R12,%RDX |
(1086) 0x6cd42 VFMADD231SD (%R15,%RDX,8),%XMM1,%XMM0 |
(1086) 0x6cd48 MOV 0x28(%R13,%R10,1),%RDX |
(1086) 0x6cd4d IMUL %RCX,%RDX |
(1086) 0x6cd51 ADD %R12,%RDX |
(1086) 0x6cd54 VFMADD231SD (%R15,%RDX,8),%XMM3,%XMM0 |
(1086) 0x6cd5a MOV 0x30(%R13,%R10,1),%RDX |
(1086) 0x6cd5f IMUL %RCX,%RDX |
(1086) 0x6cd63 ADD %R12,%RDX |
(1086) 0x6cd66 VFMADD231SD (%R15,%RDX,8),%XMM7,%XMM0 |
(1086) 0x6cd6c MOV 0x38(%R13,%R10,1),%RDX |
(1086) 0x6cd71 ADD $0x40,%R10 |
(1086) 0x6cd75 IMUL %RCX,%RDX |
(1086) 0x6cd79 ADD %R12,%RDX |
(1086) 0x6cd7c VFMADD231SD (%R15,%RDX,8),%XMM9,%XMM0 |
(1086) 0x6cd82 CMP %R8,%R10 |
(1086) 0x6cd85 JNE 6ccb7 |
(1084) 0x6cd8b VMOVSD (%RSI,%RAX,8),%XMM5 |
(1084) 0x6cd90 VFMADD132SD (%RBX,%RAX,8),%XMM5,%XMM0 |
(1084) 0x6cd96 VMOVSD %XMM0,(%RSI,%RAX,8) |
(1084) 0x6cd9b INC %RAX |
(1084) 0x6cd9e CMP %RDI,%RAX |
(1084) 0x6cda1 JNE 6cb80 |
0x6cda7 MOV %RDI,%RAX |
0x6cdaa MOV -0x38(%RBP),%RDI |
0x6cdae MOV -0x40(%RBP),%R9 |
0x6cdb2 MOV -0x48(%RBP),%R12 |
0x6cdb6 VMOVQ %XMM4,%RBX |
0x6cdbb INC %RDI |
0x6cdbe ADD %R12,%R9 |
0x6cdc1 CMP %RBX,%RDI |
0x6cdc4 JNE 6cb40 |
(1084) 0x6ce10 VXORPD %XMM0,%XMM0,%XMM0 |
(1084) 0x6ce14 JMP 6cd8b |
/beegfs/hackathon/users/eoseret/qaas_runs/170-850-6313/intel/Kripke/build/Kripke/tpl/raja/include/RAJA/index/IndexValue.hpp: 105 - 105 |
-------------------------------------------------------------------------------- |
105: return TYPE(value + a); |
/beegfs/hackathon/users/eoseret/qaas_runs/170-850-6313/intel/Kripke/build/Kripke/tpl/raja/include/RAJA/util/Layout.hpp: 55 - 55 |
-------------------------------------------------------------------------------- |
55: return a * b; |
/beegfs/hackathon/users/eoseret/qaas_runs/170-850-6313/intel/Kripke/build/Kripke/src/Kripke/Kernel/Scattering.cpp: 87 - 97 |
-------------------------------------------------------------------------------- |
87: MixElem mix_start = zone_to_mixelem(z); |
88: MixElem mix_stop = mix_start + zone_to_num_mixelem(z); |
89: |
90: double sigs_z = 0.0; |
91: for(MixElem mix = mix_start;mix < mix_stop;++ mix){ |
92: Material mat = mixelem_to_material(mix); |
93: double fraction = mixelem_to_fraction(mix); |
94: |
95: sigs_z += sigs(mat, n, global_g, global_gp) * fraction; |
96: } |
97: phi_out(nm, g, z) += sigs_z * phi(nm, gp, z); |
/beegfs/hackathon/users/eoseret/qaas_runs/170-850-6313/intel/Kripke/build/Kripke/tpl/raja/include/RAJA/policy/loop/forall.hpp: 59 - 59 |
-------------------------------------------------------------------------------- |
59: for (decltype(distance_it) i = 0; i < distance_it; ++i) { |
/beegfs/hackathon/users/eoseret/qaas_runs/170-850-6313/intel/Kripke/build/Kripke/tpl/raja/include/RAJA/util/View.hpp: 110 - 110 |
-------------------------------------------------------------------------------- |
110: return data[idx]; |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.92 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 6.57 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.44 |
Bottlenecks | micro-operation queue |
Function | _ZN4RAJA8internal17StatementExecutorINS_9statement8CollapseINS_26omp_parallel_collapse_execEN4camp7int_seqIlJLl0ELl1EEEEJNS2_3ForILl2ENS_6policy4loop9loop_execEJNS8_ILl3ESB_JNS2_6LambdaILl0EJEEEEEEEEEEEEE4execIRNS0_8LoopDataINS5_4listIJSG_EEENS5_5tupleIJNS_4impl4SpanINS_9Iterators16numeric_iteratorIN6Kripke6MomentElPSS_EESS_EENSO_INSQ_INSR_5GroupElPSW_EESW_EESZ_NSO_INSQ_INSR_4ZoneElPS10_EES10_EEEEENSM_IJEEEJZNK14ScatteringSdomclINSR_11ArchLayoutTINSR_12ArchT_OpenMPENSR_11LayoutT_DGZEEEEEvT_NSR_6SdomIdES1D_RKNSR_4Core3SetES1H_S1H_RNS1E_5FieldIdJSS_SW_S10_EEES1K_RNS1I_IdJNSR_8MaterialENSR_8LegendreENSR_11GlobalGroupES1N_EEERNS1I_INSR_7MixElemEJS10_EEERNS1I_IiJS10_EEERNS1I_IS1L_JS1Q_EEERNS1I_IdJS1Q_EEERNS1I_IS1M_JSS_EEEEUlSS_SW_SW_S10_E_EEEEEvOS1C_._omp_fn.0 |
Source | Layout.hpp:55-55,forall.hpp:59-59,View.hpp:110-110 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 3.83 |
CQA cycles if no scalar integer | 2.00 |
CQA cycles if FP arith vectorized | 3.83 |
CQA cycles if fully vectorized | 0.58 |
Front-end cycles | 3.83 |
DIV/SQRT cycles | 1.75 |
P0 cycles | 1.75 |
P1 cycles | 1.50 |
P2 cycles | 1.50 |
P3 cycles | 0.50 |
P4 cycles | 2.67 |
P5 cycles | 2.67 |
P6 cycles | 2.67 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 0.00 |
P10 cycles | 0.00 |
P11 cycles | 2.00 |
P12 cycles | 2.00 |
P13 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 25.00 |
Nb uops | 23.00 |
Nb loads | 6.00 |
Nb stores | 2.00 |
Nb stack references | 5.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 16.70 |
Bytes prefetched | 0.00 |
Bytes loaded | 48.00 |
Bytes stored | 16.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 12.50 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 12.50 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.92 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 6.57 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.44 |
Bottlenecks | micro-operation queue |
Function | _ZN4RAJA8internal17StatementExecutorINS_9statement8CollapseINS_26omp_parallel_collapse_execEN4camp7int_seqIlJLl0ELl1EEEEJNS2_3ForILl2ENS_6policy4loop9loop_execEJNS8_ILl3ESB_JNS2_6LambdaILl0EJEEEEEEEEEEEEE4execIRNS0_8LoopDataINS5_4listIJSG_EEENS5_5tupleIJNS_4impl4SpanINS_9Iterators16numeric_iteratorIN6Kripke6MomentElPSS_EESS_EENSO_INSQ_INSR_5GroupElPSW_EESW_EESZ_NSO_INSQ_INSR_4ZoneElPS10_EES10_EEEEENSM_IJEEEJZNK14ScatteringSdomclINSR_11ArchLayoutTINSR_12ArchT_OpenMPENSR_11LayoutT_DGZEEEEEvT_NSR_6SdomIdES1D_RKNSR_4Core3SetES1H_S1H_RNS1E_5FieldIdJSS_SW_S10_EEES1K_RNS1I_IdJNSR_8MaterialENSR_8LegendreENSR_11GlobalGroupES1N_EEERNS1I_INSR_7MixElemEJS10_EEERNS1I_IiJS10_EEERNS1I_IS1L_JS1Q_EEERNS1I_IdJS1Q_EEERNS1I_IS1M_JSS_EEEEUlSS_SW_SW_S10_E_EEEEEvOS1C_._omp_fn.0 |
Source | Layout.hpp:55-55,forall.hpp:59-59,View.hpp:110-110 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 3.83 |
CQA cycles if no scalar integer | 2.00 |
CQA cycles if FP arith vectorized | 3.83 |
CQA cycles if fully vectorized | 0.58 |
Front-end cycles | 3.83 |
DIV/SQRT cycles | 1.75 |
P0 cycles | 1.75 |
P1 cycles | 1.50 |
P2 cycles | 1.50 |
P3 cycles | 0.50 |
P4 cycles | 2.67 |
P5 cycles | 2.67 |
P6 cycles | 2.67 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 0.00 |
P10 cycles | 0.00 |
P11 cycles | 2.00 |
P12 cycles | 2.00 |
P13 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 25.00 |
Nb uops | 23.00 |
Nb loads | 6.00 |
Nb stores | 2.00 |
Nb stack references | 5.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 16.70 |
Bytes prefetched | 0.00 |
Bytes loaded | 48.00 |
Bytes stored | 16.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 12.50 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 12.50 |
Path / |
nb instructions | 25 |
nb uops | 23 |
loop length | 99 |
used x86 registers | 9 |
used mmx registers | 0 |
used xmm registers | 4 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 5 |
micro-operation queue | 3.83 cycles |
front end | 3.83 cycles |
| ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.75 | 1.75 | 1.50 | 1.50 | 0.50 | 2.67 | 2.67 | 2.67 | 0.00 | 0.00 | 0.00 | 0.00 | 2.00 | 2.00 |
cycles | 1.75 | 1.75 | 1.50 | 1.50 | 0.50 | 2.67 | 2.67 | 2.67 | 0.00 | 0.00 | 0.00 | 0.00 | 2.00 | 2.00 |
Cycles executing div or sqrt instructions | NA |
Front-end | 3.83 |
Dispatch | 2.67 |
Overall L1 | 3.83 |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Instruction | Nb FU | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VMOVQ %XMM10,%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
MOV -0x50(%RBP),%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV -0x58(%RBP),%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
VMOVQ %XMM8,%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
MOV (%R8),%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
ADD %RDI,%R10 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
XOR %R8D,%R8D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %RDI,-0x38(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV %R9,-0x40(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV %RAX,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %R8,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
IMUL %RBX,%R12 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA (%RDX,%R9,8),%RBX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
VMOVQ %XMM11,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
ADD %R10,%R12 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
NOPL (%RAX) | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.09 |
MOV %RDI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV -0x38(%RBP),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV -0x40(%RBP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV -0x48(%RBP),%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
VMOVQ %XMM4,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
INC %RDI | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD %R12,%R9 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP %RBX,%RDI | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
JNE 6cb40 <_ZN4RAJA8internal17StatementExecutorINS_9statement8CollapseINS_26omp_parallel_collapse_execEN4camp7int_seqIlJLl0ELl1EEEEJNS2_3ForILl2ENS_6policy4loop9loop_execEJNS8_ILl3ESB_JNS2_6LambdaILl0EJEEEEEEEEEEEEE4execIRNS0_8LoopDataINS5_4listIJSG_EEENS5_5tupleIJNS_4impl4SpanINS_9Iterators16numeric_iteratorIN6Kripke6MomentElPSS_EESS_EENSO_INSQ_INSR_5GroupElPSW_EESW_EESZ_NSO_INSQ_INSR_4ZoneElPS10_EES10_EEEEENSM_IJEEEJZNK14ScatteringSdomclINSR_11ArchLayoutTINSR_12ArchT_OpenMPENSR_11LayoutT_DGZEEEEEvT_NSR_6SdomIdES1D_RKNSR_4Core3SetES1H_S1H_RNS1E_5FieldIdJSS_SW_S10_EEES1K_RNS1I_IdJNSR_8MaterialENSR_8LegendreENSR_11GlobalGroupES1N_EEERNS1I_INSR_7MixElemEJS10_EEERNS1I_IiJS10_EEERNS1I_IS1L_JS1Q_EEERNS1I_IdJS1Q_EEERNS1I_IS1M_JSS_EEEEUlSS_SW_SW_S10_E_EEEEEvOS1C_._omp_fn.0+0x300> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |
nb instructions | 25 |
nb uops | 23 |
loop length | 99 |
used x86 registers | 9 |
used mmx registers | 0 |
used xmm registers | 4 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 5 |
micro-operation queue | 3.83 cycles |
front end | 3.83 cycles |
| ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.75 | 1.75 | 1.50 | 1.50 | 0.50 | 2.67 | 2.67 | 2.67 | 0.00 | 0.00 | 0.00 | 0.00 | 2.00 | 2.00 |
cycles | 1.75 | 1.75 | 1.50 | 1.50 | 0.50 | 2.67 | 2.67 | 2.67 | 0.00 | 0.00 | 0.00 | 0.00 | 2.00 | 2.00 |
Cycles executing div or sqrt instructions | NA |
Front-end | 3.83 |
Dispatch | 2.67 |
Overall L1 | 3.83 |
all | 0% |
load | 0% |
store | 0% |
mul | 0% |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 12% |
load | 12% |
store | 12% |
mul | 12% |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Instruction | Nb FU | ALU0/BRU0 | ALU1 | ALU2 | ALU3 | BRU1 | AGU0 | AGU1 | AGU2 | FP0 | FP1 | FP2 | FP3 | FP4 | FP5 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VMOVQ %XMM10,%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
MOV -0x50(%RBP),%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV -0x58(%RBP),%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
VMOVQ %XMM8,%R10 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
MOV (%R8),%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
ADD %RDI,%R10 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
XOR %R8D,%R8D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV %RDI,-0x38(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV %R9,-0x40(%RBP) | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
MOV %RAX,%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %R8,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
IMUL %RBX,%R12 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA (%RDX,%R9,8),%RBX | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
VMOVQ %XMM11,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
ADD %R10,%R12 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
NOPL (%RAX) | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.09 |
MOV %RDI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV -0x38(%RBP),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV -0x40(%RBP),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
MOV -0x48(%RBP),%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.33 |
VMOVQ %XMM4,%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 1 | 1 |
INC %RDI | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD %R12,%R9 | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP %RBX,%RDI | 1 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
JNE 6cb40 <_ZN4RAJA8internal17StatementExecutorINS_9statement8CollapseINS_26omp_parallel_collapse_execEN4camp7int_seqIlJLl0ELl1EEEEJNS2_3ForILl2ENS_6policy4loop9loop_execEJNS8_ILl3ESB_JNS2_6LambdaILl0EJEEEEEEEEEEEEE4execIRNS0_8LoopDataINS5_4listIJSG_EEENS5_5tupleIJNS_4impl4SpanINS_9Iterators16numeric_iteratorIN6Kripke6MomentElPSS_EESS_EENSO_INSQ_INSR_5GroupElPSW_EESW_EESZ_NSO_INSQ_INSR_4ZoneElPS10_EES10_EEEEENSM_IJEEEJZNK14ScatteringSdomclINSR_11ArchLayoutTINSR_12ArchT_OpenMPENSR_11LayoutT_DGZEEEEEvT_NSR_6SdomIdES1D_RKNSR_4Core3SetES1H_S1H_RNS1E_5FieldIdJSS_SW_S10_EEES1K_RNS1I_IdJNSR_8MaterialENSR_8LegendreENSR_11GlobalGroupES1N_EEERNS1I_INSR_7MixElemEJS10_EEERNS1I_IiJS10_EEERNS1I_IS1L_JS1Q_EEERNS1I_IdJS1Q_EEERNS1I_IS1M_JSS_EEEEUlSS_SW_SW_S10_E_EEEEEvOS1C_._omp_fn.0+0x300> | 1 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50-1 |