Loop Id: 975 | Module: exec | Source: MultiBsplineRef.hpp:226-262 [...] | Coverage: 0.01% |
---|
Loop Id: 975 | Module: exec | Source: MultiBsplineRef.hpp:226-262 [...] | Coverage: 0.01% |
---|
(977) 0x433320 LDR X4, [SP, #104] |
(977) 0x433324 LDP X16, X26, [SP, #152] |
(977) 0x433328 LDP X18, X17, [SP, #136] |
(977) 0x43332c LDP X2, X1, [SP, #120] |
(974) 0x433330 LDR X9, [SP, #112] |
(974) 0x433334 ADD X26, X26, #1 |
(974) 0x433338 CMP X26, #4 |
(974) 0x43333c ADD X16, X16, X9 |
(974) 0x433340 ADD X17, X17, X9 |
(974) 0x433344 ADD X1, X1, X9 |
(974) 0x433348 ADD X2, X2, X9 |
(974) 0x43334c ADD X18, X18, X9 |
(974) 0x433350 B.EQ 4336b4 |
(974) 0x433354 CMP W4, #1 |
(974) 0x433358 B.LT 433330 |
(977) 0x43335c ADD X9, SP, #320 |
(977) 0x433360 CMP X8, #8 |
(977) 0x433364 ORR X10, XZR, XZR |
(977) 0x433368 ORR X0, XZR, X1 |
(977) 0x43336c STP X16, X26, [SP, #152] |
(977) 0x433370 STP X1, X18, [SP, #128] |
(977) 0x433374 STR X2, [SP, #120] |
(977) 0x433378 LDR D2, [X9, X26,LSL #3] |
(977) 0x43337c ADD X9, SP, #384 |
(977) 0x433380 STR X17, [SP, #144] |
(977) 0x433384 LDR D0, [X9, X26,LSL #3] |
(977) 0x433388 MOVZ W9, #8 |
(977) 0x43338c CSEL X11, X8, X9, #8 |
(977) 0x433390 ADD X9, SP, #448 |
(977) 0x433394 STP D0, D2, [SP, #272] |
(977) 0x433398 LDR D0, [X9, X26,LSL #3] |
(977) 0x43339c STR D0, [SP, #264] |
(977) 0x4333a0 B 4333c4 |
(977) 0x4333a4 ADD X10, X10, #1 |
(977) 0x4333a8 ADD X16, X16, X15 |
(977) 0x4333ac ADD X0, X0, X15 |
(977) 0x4333b0 CMP X10, #4 |
(977) 0x4333b4 ADD X2, X2, X15 |
(977) 0x4333b8 ADD X18, X18, X15 |
(977) 0x4333bc ADD X17, X17, X15 |
(977) 0x4333c0 B.EQ 433320 |
(977) 0x4333c4 ADD X13, SP, #416 |
(977) 0x4333c8 LDR D0, [SP, #280] |
(977) 0x4333cc CMP X12, X11 |
(977) 0x4333d0 LDR D20, [X13, X10,LSL #3] |
(977) 0x4333d4 ADD X13, SP, #352 |
(977) 0x4333d8 CSINC W9, WZR, WZR, #2 |
(977) 0x4333dc FMUL D23, D20, D0 |
(977) 0x4333e0 LDP D2, D0, [SP, #264] |
(977) 0x4333e4 FMUL D24, D0, D20 |
(977) 0x4333e8 FMUL D25, D2, D20 |
(977) 0x4333ec LDR D20, [X13, X10,LSL #3] |
(977) 0x4333f0 ADD X13, SP, #288 |
(977) 0x4333f4 FMUL D26, D20, D0 |
(977) 0x4333f8 FMUL D27, D20, D2 |
(977) 0x4333fc LDR D20, [X13, X10,LSL #3] |
(977) 0x433400 LDR W13, [SP, #256] |
(977) 0x433404 ORR W9, W9, W13 |
(977) 0x433408 FMUL D28, D20, D2 |
(977) 0x43340c TBZ W9, #0, 433420 |
(977) 0x433410 ORR X5, XZR, XZR |
(977) 0x433414 B 433558 |
0x433420 UDIV X13, X12, X8 |
0x433424 ORR X9, XZR, XZR |
0x433428 DUP Z29.D, Z23.D[0] |
0x43342c DUP Z30.D, Z26.D[0] |
0x433430 DUP Z31.D, Z24.D[0] |
0x433434 DUP Z20.D, Z28.D[0] |
0x433438 DUP Z21.D, Z27.D[0] |
0x43343c DUP Z22.D, Z25.D[0] |
0x433440 UMSUBL X14, W13, W8, X12 |
0x433444 UMADDL X5, W13, W8, XZR |
0x433448 SUB X13, X29, #64 |
0x43344c LDR Z0, [X13, #510, MUL VL] |
0x433450 SUB X13, X29, #64 |
0x433454 LDR Z1, [X13, #509, MUL VL] |
0x433458 SUB X13, X29, #64 |
0x43345c LDR Z2, [X13, #508, MUL VL] |
0x433460 SUB X13, X29, #64 |
0x433464 LDR Z3, [X13, #507, MUL VL] |
(970) 0x433468 LD1D {Z8.D}, P0/Z, [X16, X9,LSL #3] |
(970) 0x43346c LD1D {Z9.D}, P0/Z, [X0, X9,LSL #3] |
(970) 0x433470 ADD X13, X21, X9,LSL #3 |
(970) 0x433474 FMUL Z13.D, Z8.D, Z0.D |
(970) 0x433478 LD1D {Z10.D}, P0/Z, [X2, X9,LSL #3] |
(970) 0x43347c LD1D {Z12.D}, P0/Z, [X18, X9,LSL #3] |
(970) 0x433480 FMUL Z14.D, Z8.D, Z4.D |
(970) 0x433484 FMUL Z15.D, Z7.D, Z12.D |
(970) 0x433488 FMUL Z8.D, Z8.D, Z17.D |
(970) 0x43348c FMLA Z13.D, P0/M, Z9.D, Z1.D |
(970) 0x433490 FMLA Z15.D, P0/M, Z9.D, Z5.D |
(970) 0x433494 FMLA Z8.D, P0/M, Z9.D, Z18.D |
(970) 0x433498 LD1D {Z9.D}, P0/Z, [X13, MUL VL] |
(970) 0x43349c FMLA Z14.D, P0/M, Z10.D, Z6.D |
(970) 0x4334a0 FMLA Z13.D, P0/M, Z10.D, Z2.D |
(970) 0x4334a4 FMLA Z14.D, P0/M, Z15.D, Z16.D |
(970) 0x4334a8 FMLA Z8.D, P0/M, Z10.D, Z19.D |
(970) 0x4334ac FMLA Z13.D, P0/M, Z3.D, Z12.D |
(970) 0x4334b0 FMLA Z8.D, P0/M, Z12.D, Z16.D |
(970) 0x4334b4 FMLA Z9.D, P0/M, Z29.D, Z13.D |
(970) 0x4334b8 ST1D {Z9.D}, P0, [X13, MUL VL] |
(970) 0x4334bc ADD X13, X13, X27,LSL #3 |
(970) 0x4334c0 LD1D {Z9.D}, P0/Z, [X13, MUL VL] |
(970) 0x4334c4 FMLA Z9.D, P0/M, Z30.D, Z13.D |
(970) 0x4334c8 ST1D {Z9.D}, P0, [X13, MUL VL] |
(970) 0x4334cc ADD X13, X13, X28 |
(970) 0x4334d0 LD1D {Z9.D}, P0/Z, [X13, MUL VL] |
(970) 0x4334d4 FMLA Z9.D, P0/M, Z14.D, Z31.D |
(970) 0x4334d8 ST1D {Z9.D}, P0, [X13, MUL VL] |
(970) 0x4334dc ADD X13, X13, X28 |
(970) 0x4334e0 LD1D {Z9.D}, P0/Z, [X13, MUL VL] |
(970) 0x4334e4 FMLA Z9.D, P0/M, Z20.D, Z13.D |
(970) 0x4334e8 ST1D {Z9.D}, P0, [X13, MUL VL] |
(970) 0x4334ec ADD X13, X13, X28 |
(970) 0x4334f0 LD1D {Z9.D}, P0/Z, [X13, MUL VL] |
(970) 0x4334f4 FMLA Z9.D, P0/M, Z14.D, Z21.D |
(970) 0x4334f8 ST1D {Z9.D}, P0, [X13, MUL VL] |
(970) 0x4334fc ADD X13, X13, X28 |
(970) 0x433500 LD1D {Z9.D}, P0/Z, [X13, MUL VL] |
(970) 0x433504 FMAD Z8.D, P0/M, Z22.D, Z9.D |
(970) 0x433508 ST1D {Z8.D}, P0, [X13, MUL VL] |
(970) 0x43350c LD1D {Z8.D}, P0/Z, [X20, X9,LSL #3] |
(970) 0x433510 FMLA Z8.D, P0/M, Z13.D, Z31.D |
(970) 0x433514 ST1D {Z8.D}, P0, [X20, X9,LSL #3] |
(970) 0x433518 LD1D {Z8.D}, P0/Z, [X23, X9,LSL #3] |
(970) 0x43351c FMLA Z8.D, P0/M, Z13.D, Z21.D |
(970) 0x433520 ST1D {Z8.D}, P0, [X23, X9,LSL #3] |
(970) 0x433524 LD1D {Z8.D}, P0/Z, [X24, X9,LSL #3] |
(970) 0x433528 FMLA Z8.D, P0/M, Z14.D, Z22.D |
(970) 0x43352c ST1D {Z8.D}, P0, [X24, X9,LSL #3] |
(970) 0x433530 LD1D {Z8.D}, P0/Z, [X22, X9,LSL #3] |
(970) 0x433534 FMLA Z8.D, P0/M, Z13.D, Z22.D |
(970) 0x433538 ST1D {Z8.D}, P0, [X22, X9,LSL #3] |
(970) 0x43353c ADD X9, X9, X8 |
(970) 0x433540 CMP X5, X9 |
(970) 0x433544 B.NE 433468 |
0x433548 LDP D15, D8, [SP, #192] |
0x43354c LDP D1, D31, [SP, #176] |
0x433550 LDR D3, [SP, #168] |
0x433554 CBZ X14, 4333a4 |
(977) 0x433558 LDP D10, D9, [SP, #240] |
(977) 0x43355c LDP D13, D12, [SP, #224] |
(977) 0x433560 LDP D0, D14, [SP, #208] |
(977) 0x433564 UBFM X26, X5, #61, #60 |
(977) 0x433568 ORR X6, XZR, XZR |
(977) 0x43356c SUB X5, X12, X5 |
(977) 0x433570 ADD X14, X17, X26 |
(977) 0x433574 ADD X9, X0, X26 |
(977) 0x433578 ADD X4, X2, X26 |
(977) 0x43357c ADD X3, X18, X26 |
(977) 0x433580 ADD X13, X21, X26 |
(977) 0x433584 ADD X25, X20, X26 |
(977) 0x433588 ADD X19, X23, X26 |
(977) 0x43358c ADD X7, X24, X26 |
(977) 0x433590 ADD X26, X22, X26 |
(977) 0x433594 HINT #0 |
(977) 0x433598 HINT #0 |
(977) 0x43359c HINT #0 |
(976) 0x4335a0 UBFM X30, X6, #61, #60 |
(976) 0x4335a4 FMOV D2, D31 |
(976) 0x4335a8 FMOV D4, D3 |
(976) 0x4335ac ADD X6, X6, #1 |
(976) 0x4335b0 LDR D20, [X14, X30] |
(976) 0x4335b4 LDR D21, [X9, X30] |
(976) 0x4335b8 LDR D22, [X4, X30] |
(976) 0x4335bc LDR D29, [X3, X30] |
(976) 0x4335c0 ADD X1, X13, X30 |
(976) 0x4335c4 CMP X5, X6 |
(976) 0x4335c8 FMUL D30, D20, D8 |
(976) 0x4335cc FMADD D30, D21, D15, D30 |
(976) 0x4335d0 FMADD D30, D22, D31, D30 |
(976) 0x4335d4 FMUL D31, D20, D3 |
(976) 0x4335d8 FMOV D3, D1 |
(976) 0x4335dc FMUL D20, D20, D12 |
(976) 0x4335e0 FMADD D31, D22, D10, D31 |
(976) 0x4335e4 FMADD D20, D21, D13, D20 |
(976) 0x4335e8 FMADD D30, D1, D29, D30 |
(976) 0x4335ec FMOV D1, D15 |
(976) 0x4335f0 FMOV D15, D8 |
(976) 0x4335f4 FMUL D8, D21, D9 |
(976) 0x4335f8 FMADD D8, D0, D29, D8 |
(976) 0x4335fc FMADD D20, D22, D14, D20 |
(976) 0x433600 LDR D22, [X25, X30] |
(976) 0x433604 FMADD D31, D8, D11, D31 |
(976) 0x433608 LDR D8, [X1] |
(976) 0x43360c FMADD D20, D29, D11, D20 |
(976) 0x433610 FMADD D22, D30, D24, D22 |
(976) 0x433614 FMADD D8, D23, D30, D8 |
(976) 0x433618 STR D22, [X25, X30] |
(976) 0x43361c LDR D22, [X19, X30] |
(976) 0x433620 STR D8, [X1] |
(976) 0x433624 ADD X1, X1, X28 |
(976) 0x433628 LDR D8, [X1] |
(976) 0x43362c FMADD D22, D30, D27, D22 |
(976) 0x433630 STR D22, [X19, X30] |
(976) 0x433634 FMADD D8, D26, D30, D8 |
(976) 0x433638 LDR D22, [X7, X30] |
(976) 0x43363c STR D8, [X1] |
(976) 0x433640 ADD X1, X1, X28 |
(976) 0x433644 FMOV D8, D15 |
(976) 0x433648 FMOV D15, D1 |
(976) 0x43364c FMOV D1, D3 |
(976) 0x433650 FMOV D3, D4 |
(976) 0x433654 LDR D21, [X1] |
(976) 0x433658 FMADD D22, D31, D25, D22 |
(976) 0x43365c STR D22, [X7, X30] |
(976) 0x433660 FMADD D21, D31, D24, D21 |
(976) 0x433664 STR D21, [X1] |
(976) 0x433668 ADD X1, X1, X28 |
(976) 0x43366c LDR D21, [X1] |
(976) 0x433670 FMADD D21, D28, D30, D21 |
(976) 0x433674 STR D21, [X1] |
(976) 0x433678 ADD X1, X1, X28 |
(976) 0x43367c LDR D21, [X1] |
(976) 0x433680 FMADD D21, D31, D27, D21 |
(976) 0x433684 FMOV D31, D2 |
(976) 0x433688 STR D21, [X1] |
(976) 0x43368c LDR D21, [X1, X28] |
(976) 0x433690 FMADD D20, D20, D25, D21 |
(976) 0x433694 LDR D21, [X26, X30] |
(976) 0x433698 STR D20, [X1, X28] |
(976) 0x43369c FMADD D21, D30, D25, D21 |
(976) 0x4336a0 STR D21, [X26, X30] |
(976) 0x4336a4 B.NE 4335a0 |
(977) 0x4336a8 SUB X9, X29, #64 |
(977) 0x4336ac LDR Z4, [X9, #511, MUL VL] |
(977) 0x4336b0 B 4333a4 |
/home/hbollore/qaas-runs/171-284-6744/intel/miniqmc/build/miniqmc/src/Numerics/Spline2/MultiBsplineRef.hpp: 226 - 262 |
-------------------------------------------------------------------------------- |
226: for (int i = 0; i < 4; i++) |
227: for (int j = 0; j < 4; j++) |
[...] |
234: const T pre20 = d2a[i] * b[j]; |
235: const T pre10 = da[i] * b[j]; |
236: const T pre00 = a[i] * b[j]; |
237: const T pre11 = da[i] * db[j]; |
238: const T pre01 = a[i] * db[j]; |
239: const T pre02 = a[i] * d2b[j]; |
240: |
241: const int iSplitPoint = num_splines; |
242: for (int n = 0; n < iSplitPoint; n++) |
243: { |
244: T coefsv = coefs[n]; |
245: T coefsvzs = coefszs[n]; |
246: T coefsv2zs = coefs2zs[n]; |
247: T coefsv3zs = coefs3zs[n]; |
248: |
249: T sum0 = c[0] * coefsv + c[1] * coefsvzs + c[2] * coefsv2zs + c[3] * coefsv3zs; |
250: T sum1 = dc[0] * coefsv + dc[1] * coefsvzs + dc[2] * coefsv2zs + dc[3] * coefsv3zs; |
251: T sum2 = d2c[0] * coefsv + d2c[1] * coefsvzs + d2c[2] * coefsv2zs + d2c[3] * coefsv3zs; |
252: |
253: hxx[n] += pre20 * sum0; |
254: hxy[n] += pre11 * sum0; |
255: hxz[n] += pre10 * sum1; |
256: hyy[n] += pre02 * sum0; |
257: hyz[n] += pre01 * sum1; |
258: hzz[n] += pre00 * sum2; |
259: gx[n] += pre10 * sum0; |
260: gy[n] += pre01 * sum0; |
261: gz[n] += pre00 * sum1; |
262: vals[n] += pre00 * sum0; |
Coverage (%) | Name | Source Location | Module |
---|
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 1.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.67 - 6.67 |
Bottlenecks | P6, P8, |
Function | void miniqmcreference::MultiBsplineEvalRef::evaluate_vgh |
Source | MultiBsplineRef.hpp:242-242 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 5.00 - 20.00 |
CQA cycles if no scalar integer | 5.00 - 20.00 |
CQA cycles if FP arith vectorized | 5.00 - 20.00 |
CQA cycles if fully vectorized | 5.00 - 20.00 |
Front-end cycles | 2.75 |
DIV/SQRT cycles | 0.50 |
P0 cycles | 0.50 |
P1 cycles | 1.75 |
P2 cycles | 1.75 |
P3 cycles | 3.00 |
P4 cycles | 1.50 |
P5 cycles | 3.00 |
P6 cycles | 3.00 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 2.33 |
P10 cycles | 2.33 |
P11 cycles | 2.33 |
P12 cycles | 0.00 |
P13 cycles | 0.00 |
P14 cycles | 5.00 - 20.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 22.00 |
Nb uops | 22.00 |
Nb loads | NA |
Nb stores | 0.00 |
Nb stack references | 0.00 |
FLOP/cycle | 0.00 - 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 8.40 - 33.60 |
Bytes prefetched | 0.00 |
Bytes loaded | 168.00 |
Bytes stored | 0.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 75.00 |
Vectorization ratio load | 85.71 |
Vectorization ratio store | NA |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | NA |
Vectorization ratio fma | 0.00 |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 85.71 |
Vector-efficiency ratio all | 75.00 |
Vector-efficiency ratio load | 75.00 |
Vector-efficiency ratio store | NA |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | NA |
Vector-efficiency ratio fma | 25.00 |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 89.29 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 1.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.67 - 6.67 |
Bottlenecks | P6, P8, |
Function | void miniqmcreference::MultiBsplineEvalRef::evaluate_vgh |
Source | MultiBsplineRef.hpp:242-242 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 5.00 - 20.00 |
CQA cycles if no scalar integer | 5.00 - 20.00 |
CQA cycles if FP arith vectorized | 5.00 - 20.00 |
CQA cycles if fully vectorized | 5.00 - 20.00 |
Front-end cycles | 2.75 |
DIV/SQRT cycles | 0.50 |
P0 cycles | 0.50 |
P1 cycles | 1.75 |
P2 cycles | 1.75 |
P3 cycles | 3.00 |
P4 cycles | 1.50 |
P5 cycles | 3.00 |
P6 cycles | 3.00 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 2.33 |
P10 cycles | 2.33 |
P11 cycles | 2.33 |
P12 cycles | 0.00 |
P13 cycles | 0.00 |
P14 cycles | 5.00 - 20.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 22.00 |
Nb uops | 22.00 |
Nb loads | NA |
Nb stores | 0.00 |
Nb stack references | 0.00 |
FLOP/cycle | 0.00 - 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 8.40 - 33.60 |
Bytes prefetched | 0.00 |
Bytes loaded | 168.00 |
Bytes stored | 0.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 75.00 |
Vectorization ratio load | 85.71 |
Vectorization ratio store | NA |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | NA |
Vectorization ratio fma | 0.00 |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 85.71 |
Vector-efficiency ratio all | 75.00 |
Vector-efficiency ratio load | 75.00 |
Vector-efficiency ratio store | NA |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | NA |
Vector-efficiency ratio fma | 25.00 |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 89.29 |
Path / |
Function | void miniqmcreference::MultiBsplineEvalRef::evaluate_vgh |
Source file and lines | MultiBsplineRef.hpp:226-262 |
Module | exec |
nb instructions | 22 |
loop length | 88 |
nb stack references | 0 |
front end | 2.75 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 0.50 | 0.50 | 1.75 | 1.75 | 3.00 | 1.50 | 3.00 | 3.00 | 0.00 | 0.00 | 2.33 | 2.33 | 2.33 | 0.00 | 0.00 |
cycles | 0.50 | 0.50 | 1.75 | 1.75 | 3.00 | 1.50 | 3.00 | 3.00 | 0.00 | 0.00 | 2.33 | 2.33 | 2.33 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | 5.00-20.00 |
Front-end | 2.75 |
Overall L1 | 5.00-20.00 |
all | 75% |
load | 85% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | 0% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 85% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
UDIV X13, X12, X8 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5-20 | 5-20 |
ORR X9, XZR, XZR | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
DUP Z29.D, Z23.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
DUP Z30.D, Z26.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
DUP Z31.D, Z24.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
DUP Z20.D, Z28.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
DUP Z21.D, Z27.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
DUP Z22.D, Z25.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
UMSUBL X14, W13, W8, X12 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
UMADDL X5, W13, W8, XZR | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
SUB X13, X29, #64 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR Z0, [X13, #510, MUL VL] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
SUB X13, X29, #64 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR Z1, [X13, #509, MUL VL] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
SUB X13, X29, #64 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR Z2, [X13, #508, MUL VL] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
SUB X13, X29, #64 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR Z3, [X13, #507, MUL VL] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
LDP D15, D8, [SP, #192] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 |
LDP D1, D31, [SP, #176] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 |
LDR D3, [SP, #168] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 |
CBZ X14, 4333a4 <_ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m+0x944> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
Function | void miniqmcreference::MultiBsplineEvalRef::evaluate_vgh |
Source file and lines | MultiBsplineRef.hpp:226-262 |
Module | exec |
nb instructions | 22 |
loop length | 88 |
nb stack references | 0 |
front end | 2.75 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 0.50 | 0.50 | 1.75 | 1.75 | 3.00 | 1.50 | 3.00 | 3.00 | 0.00 | 0.00 | 2.33 | 2.33 | 2.33 | 0.00 | 0.00 |
cycles | 0.50 | 0.50 | 1.75 | 1.75 | 3.00 | 1.50 | 3.00 | 3.00 | 0.00 | 0.00 | 2.33 | 2.33 | 2.33 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | 5.00-20.00 |
Front-end | 2.75 |
Overall L1 | 5.00-20.00 |
all | 75% |
load | 85% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | 0% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 85% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
UDIV X13, X12, X8 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5-20 | 5-20 |
ORR X9, XZR, XZR | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
DUP Z29.D, Z23.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
DUP Z30.D, Z26.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
DUP Z31.D, Z24.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
DUP Z20.D, Z28.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
DUP Z21.D, Z27.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
DUP Z22.D, Z25.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
UMSUBL X14, W13, W8, X12 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
UMADDL X5, W13, W8, XZR | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
SUB X13, X29, #64 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR Z0, [X13, #510, MUL VL] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
SUB X13, X29, #64 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR Z1, [X13, #509, MUL VL] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
SUB X13, X29, #64 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR Z2, [X13, #508, MUL VL] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
SUB X13, X29, #64 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR Z3, [X13, #507, MUL VL] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
LDP D15, D8, [SP, #192] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 |
LDP D1, D31, [SP, #176] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 |
LDR D3, [SP, #168] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 |
CBZ X14, 4333a4 <_ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m+0x944> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |