# benchmark
op

# naive C with OpenMP
for for for

# unroll, first try
h

# register allocation
kernels

# unroll, second try
simd

# NEON intrinsics
optional

# naive NEON assembly with PLD
asm

# pipeline optimization, first try
more register load mla

# pipeline optimization, second try
interleave load mla

# pipeline optimization, third try
loop tail

# usual practice, load/save
233

# usual practice, unroll
233

# usual practice, save register
233