enclose-io/compiler

View on GitHub
lts/deps/zlib/contrib/optimizations/slide_hash_neon.h

Summary

Maintainability
Test Coverage
/* Copyright 2018 The Chromium Authors. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the Chromium source repository LICENSE file.
 */
#ifndef __SLIDE_HASH__NEON__
#define __SLIDE_HASH__NEON__

#include "deflate.h"
#include <arm_neon.h>

/* Subtract w_size from every 16-bit entry of a table, clamping at zero.
 *
 * hash:      first entry of the table; updated in place.
 * hash_size: number of entries in the table. The loop below advances 16
 *            entries at a time and stops only when it lands exactly on the
 *            end of the table, so this must be a multiple of 16.
 * w_size:    amount subtracted from each entry.
 */
inline static void ZLIB_INTERNAL neon_slide_hash_update(Posf *hash,
                                                        const uInt hash_size,
                                                        const ush w_size)
{
    /* A NEON 'Q' register is 128 bits wide, i.e. eight 16-bit lanes.
     * Reference: ARM DHT 0002A, section 1.3.2 NEON Registers.
     */
    const size_t lanes = sizeof(uint16x8_t) / sizeof(uint16_t);
    /* Two registers are processed per iteration. This unrolling improved
     * compression performance on both ARMv7 (from 11.7% to 13.4%) and ARMv8
     * (from 3.7% to 7.5%) for HTML4 content; the full benchmarking data is
     * at http://crbug.com/863257.
     */
    const size_t step = 2*lanes;
    const uint16x8_t window = vdupq_n_u16(w_size);
    Posf *const limit = hash + hash_size;
    Posf *cursor = hash;

    while (cursor != limit) {
        Posf *const upper = cursor + lanes;

        /* vqsubq_u16 is a saturating subtract (the first 'q'): results that
         * would go below zero become zero, which stands in for the scalar
         * expression (m >= wsize ? m - wsize : NIL).
         * Both halves are loaded before either one is stored.
         */
        const uint16x8_t lo = vqsubq_u16(vld1q_u16(cursor), window);
        const uint16x8_t hi = vqsubq_u16(vld1q_u16(upper), window);

        vst1q_u16(cursor, lo);
        vst1q_u16(upper, hi);

        cursor += step;
    }
}


/* Rebase the deflate hash tables by w_size using the NEON helper.
 *
 * head:      table of hash_size entries; always updated.
 * prev:      table of w_size entries; updated unless FASTEST is defined.
 * w_size:    amount subtracted (with saturation at zero) from every entry.
 * hash_size: number of entries in head.
 */
inline static void ZLIB_INTERNAL neon_slide_hash(Posf *head, Posf *prev,
                                                 const unsigned short w_size,
                                                 const uInt hash_size)
{
    /*
     * SIMD implementation for hash table rebase assumes:
     * 1. hash chain offset (Pos) is 2 bytes.
     * 2. hash table size is multiple of 32 bytes.
     * #1 should be true as Pos is defined as "ush"
     * #2 should be true as hash_bits are greater than 7
     */
    const size_t size = hash_size * sizeof(head[0]);
    Assert(sizeof(Pos) == 2, "Wrong Pos size.");
    /* '%' and '*' have equal precedence and group left to right, so the
     * divisor needs its own parentheses. Without them the expression is
     * (size % 16) * 2, which only verifies a 16-byte multiple, while
     * neon_slide_hash_update consumes two 16-byte vectors per iteration.
     */
    Assert((size % (sizeof(uint16x8_t) * 2)) == 0, "Hash table size error.");

    neon_slide_hash_update(head, hash_size, w_size);
#ifndef FASTEST
    /* The prev table is passed with w_size as its entry count. */
    neon_slide_hash_update(prev, w_size, w_size);
#endif
}

#endif