// Copyright 2023 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Must be included inside an existing include guard, with the following ops
// already defined: BitCast, And, Set, ShiftLeft, ShiftRight, PromoteLowerTo,
// ConcatEven, ConcatOdd, plus the optional detail::PromoteEvenTo and
// detail::PromoteOddTo (if implemented in the target-specific header).

// HWY_NAMESPACE is normally set by set_macros-inl.h before this header is
// included; if not, we are viewing this header standalone. Reduce IDE errors
// by:
#if !defined(HWY_NAMESPACE)
// 1) Defining HWY_IDE so we get syntax highlighting rather than all-gray text.
#include "hwy/ops/shared-inl.h"
// 2) Entering the HWY_NAMESPACE to make definitions from shared-inl.h visible.
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
#define HWY_INSIDE_END_NAMESPACE
// 3) Providing dummy VFromD/TFromV/DFromV (usually done by the
// target-specific header).
template <class D>
using VFromD = int;
template <class V>
using TFromV = int;
template <class V>
struct DFromV {};
#endif

// ------------------------------ Vec/Create/Get/Set2..4

// On SVE and RVV, Vec2..4 are aliases to built-in types. Also exclude the
// fixed-size SVE targets.
#if HWY_IDE || (!HWY_HAVE_SCALABLE && !HWY_TARGET_IS_SVE)

// NOTE: these are used inside arm_neon-inl.h, hence they cannot be defined in
// generic_ops-inl.h, which is included after that.
template <class D>
struct Vec2 {
  VFromD<D> v0;
  VFromD<D> v1;
};

template <class D>
struct Vec3 {
  VFromD<D> v0;
  VFromD<D> v1;
  VFromD<D> v2;
};

template <class D>
struct Vec4 {
  VFromD<D> v0;
  VFromD<D> v1;
  VFromD<D> v2;
  VFromD<D> v3;
};

// D arg is unused but allows deducing D.
template <class D>
HWY_API Vec2<D> Create2(D /* tag */, VFromD<D> v0, VFromD<D> v1) {
  return Vec2<D>{v0, v1};
}

template <class D>
HWY_API Vec3<D> Create3(D /* tag */, VFromD<D> v0, VFromD<D> v1,
                        VFromD<D> v2) {
  return Vec3<D>{v0, v1, v2};
}

template <class D>
HWY_API Vec4<D> Create4(D /* tag */, VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
                        VFromD<D> v3) {
  return Vec4<D>{v0, v1, v2, v3};
}

template <size_t kIndex, class D>
HWY_API VFromD<D> Get2(Vec2<D> tuple) {
  static_assert(kIndex < 2, "Tuple index out of bounds");
  return kIndex == 0 ? tuple.v0 : tuple.v1;
}

template <size_t kIndex, class D>
HWY_API VFromD<D> Get3(Vec3<D> tuple) {
  static_assert(kIndex < 3, "Tuple index out of bounds");
  return kIndex == 0 ? tuple.v0 : kIndex == 1 ? tuple.v1 : tuple.v2;
}

template <size_t kIndex, class D>
HWY_API VFromD<D> Get4(Vec4<D> tuple) {
  static_assert(kIndex < 4, "Tuple index out of bounds");
  return kIndex == 0   ? tuple.v0
         : kIndex == 1 ? tuple.v1
         : kIndex == 2 ? tuple.v2
                       : tuple.v3;
}

template <size_t kIndex, class D>
HWY_API Vec2<D> Set2(Vec2<D> tuple, VFromD<D> val) {
  static_assert(kIndex < 2, "Tuple index out of bounds");
  if (kIndex == 0) {
    tuple.v0 = val;
  } else {
    tuple.v1 = val;
  }
  return tuple;
}

template <size_t kIndex, class D>
HWY_API Vec3<D> Set3(Vec3<D> tuple, VFromD<D> val) {
  static_assert(kIndex < 3, "Tuple index out of bounds");
  if (kIndex == 0) {
    tuple.v0 = val;
  } else if (kIndex == 1) {
    tuple.v1 = val;
  } else {
    tuple.v2 = val;
  }
  return tuple;
}

template <size_t kIndex, class D>
HWY_API Vec4<D> Set4(Vec4<D> tuple, VFromD<D> val) {
  static_assert(kIndex < 4, "Tuple index out of bounds");
  if (kIndex == 0) {
    tuple.v0 = val;
  } else if (kIndex == 1) {
    tuple.v1 = val;
  } else if (kIndex == 2) {
    tuple.v2 = val;
  } else {
    tuple.v3 = val;
  }
  return tuple;
}

#endif  // HWY_IDE || (!HWY_HAVE_SCALABLE && !HWY_TARGET_IS_SVE)
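// Illustrative usage sketch (example only, not part of the header): tuples
// bundle vectors for ops such as LoadInterleaved3/StoreInterleaved3. The
// helper name ExampleTuple3 is hypothetical; this assumes a full target
// header already provides Set and TFromD.
#if 0
template <class D>
HWY_API VFromD<D> ExampleTuple3(D d) {
  // Create3 deduces D from the otherwise-unused tag argument.
  Vec3<D> t = Create3(d, Set(d, static_cast<TFromD<D>>(1)),
                      Set(d, static_cast<TFromD<D>>(2)),
                      Set(d, static_cast<TFromD<D>>(3)));
  // The index is a compile-time constant; out-of-range indices trip the
  // static_assert above.
  const VFromD<D> v1 = Get3<1>(t);  // lanes are all 2
  // Set3 returns an updated copy rather than mutating its argument.
  t = Set3<0>(t, v1);
  return Get3<0>(t);  // lanes are all 2
}
#endif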
// ------------------------------ Rol/Ror (And, Or, Neg, Shl, Shr)

#if (defined(HWY_NATIVE_ROL_ROR_8) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ROL_ROR_8
#undef HWY_NATIVE_ROL_ROR_8
#else
#define HWY_NATIVE_ROL_ROR_8
#endif

template <class V, HWY_IF_UI8(TFromV<V>)>
HWY_API V Rol(V a, V b) {
  const DFromV<V> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;

  const auto shift_amt_mask = Set(du, uint8_t{7});
  const auto shl_amt = And(BitCast(du, b), shift_amt_mask);
  const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);

  const auto vu = BitCast(du, a);
  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
}

template <class V, HWY_IF_UI8(TFromV<V>)>
HWY_API V Ror(V a, V b) {
  const DFromV<V> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;

  const auto shift_amt_mask = Set(du, uint8_t{7});
  const auto shr_amt = And(BitCast(du, b), shift_amt_mask);
  const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);

  const auto vu = BitCast(du, a);
  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
}

#endif  // HWY_NATIVE_ROL_ROR_8

#if (defined(HWY_NATIVE_ROL_ROR_16) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ROL_ROR_16
#undef HWY_NATIVE_ROL_ROR_16
#else
#define HWY_NATIVE_ROL_ROR_16
#endif

template <class V, HWY_IF_UI16(TFromV<V>)>
HWY_API V Rol(V a, V b) {
  const DFromV<V> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;

  const auto shift_amt_mask = Set(du, uint16_t{15});
  const auto shl_amt = And(BitCast(du, b), shift_amt_mask);
  const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);

  const auto vu = BitCast(du, a);
  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
}

template <class V, HWY_IF_UI16(TFromV<V>)>
HWY_API V Ror(V a, V b) {
  const DFromV<V> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;

  const auto shift_amt_mask = Set(du, uint16_t{15});
  const auto shr_amt = And(BitCast(du, b), shift_amt_mask);
  const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);

  const auto vu = BitCast(du, a);
  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
}

#endif  // HWY_NATIVE_ROL_ROR_16
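// Note on the Neg-based shift amount in the Rol/Ror implementations above:
// for lane width L (a power of two), rotating left by b is
// Shl(x, b mod L) | Shr(x, (L - b) mod L), and (L - b) mod L == (-b) & (L-1).
// Masking both amounts keeps the shifts in range, so b == 0 is well-defined:
// Shl(x, 0) | Shr(x, 0) == x, whereas the naive L - b would shift by L.
// For example, with L == 8 and b == 3: shl_amt == 3 and shr_amt == 5.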
#if (defined(HWY_NATIVE_ROL_ROR_32_64) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ROL_ROR_32_64
#undef HWY_NATIVE_ROL_ROR_32_64
#else
#define HWY_NATIVE_ROL_ROR_32_64
#endif

template <class V, HWY_IF_UI32(TFromV<V>)>
HWY_API V Rol(V a, V b) {
  const DFromV<V> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;

  const auto shift_amt_mask = Set(du, uint32_t{31});
  const auto shl_amt = And(BitCast(du, b), shift_amt_mask);
  const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);

  const auto vu = BitCast(du, a);
  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
}

template <class V, HWY_IF_UI32(TFromV<V>)>
HWY_API V Ror(V a, V b) {
  const DFromV<V> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;

  const auto shift_amt_mask = Set(du, uint32_t{31});
  const auto shr_amt = And(BitCast(du, b), shift_amt_mask);
  const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);

  const auto vu = BitCast(du, a);
  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
}

#if HWY_HAVE_INTEGER64
template <class V, HWY_IF_UI64(TFromV<V>)>
HWY_API V Rol(V a, V b) {
  const DFromV<V> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;

  const auto shift_amt_mask = Set(du, uint64_t{63});
  const auto shl_amt = And(BitCast(du, b), shift_amt_mask);
  const auto shr_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);

  const auto vu = BitCast(du, a);
  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
}

template <class V, HWY_IF_UI64(TFromV<V>)>
HWY_API V Ror(V a, V b) {
  const DFromV<V> d;
  const RebindToSigned<decltype(d)> di;
  const RebindToUnsigned<decltype(d)> du;

  const auto shift_amt_mask = Set(du, uint64_t{63});
  const auto shr_amt = And(BitCast(du, b), shift_amt_mask);
  const auto shl_amt = And(BitCast(du, Neg(BitCast(di, b))), shift_amt_mask);

  const auto vu = BitCast(du, a);
  return BitCast(d, Or(Shl(vu, shl_amt), Shr(vu, shr_amt)));
}
#endif  // HWY_HAVE_INTEGER64

#endif  // HWY_NATIVE_ROL_ROR_32_64

// ------------------------------ RotateLeftSame/RotateRightSame

#if (defined(HWY_NATIVE_ROL_ROR_SAME_8) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ROL_ROR_SAME_8
#undef HWY_NATIVE_ROL_ROR_SAME_8
#else
#define HWY_NATIVE_ROL_ROR_SAME_8
#endif

template <class V, HWY_IF_UI8(TFromV<V>)>
HWY_API V RotateLeftSame(V v, int bits) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  const int shl_amt = bits & 7;
  const int shr_amt =
      static_cast<int>((0u - static_cast<unsigned>(bits)) & 7u);

  const auto vu = BitCast(du, v);
  return BitCast(d,
                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
}

template <class V, HWY_IF_UI8(TFromV<V>)>
HWY_API V RotateRightSame(V v, int bits) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  const int shr_amt = bits & 7;
  const int shl_amt =
      static_cast<int>((0u - static_cast<unsigned>(bits)) & 7u);

  const auto vu = BitCast(du, v);
  return BitCast(d,
                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
}
#endif  // HWY_NATIVE_ROL_ROR_SAME_8

#if (defined(HWY_NATIVE_ROL_ROR_SAME_16) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ROL_ROR_SAME_16
#undef HWY_NATIVE_ROL_ROR_SAME_16
#else
#define HWY_NATIVE_ROL_ROR_SAME_16
#endif

template <class V, HWY_IF_UI16(TFromV<V>)>
HWY_API V RotateLeftSame(V v, int bits) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  const int shl_amt = bits & 15;
  const int shr_amt =
      static_cast<int>((0u - static_cast<unsigned>(bits)) & 15u);

  const auto vu = BitCast(du, v);
  return BitCast(d,
                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
}

template <class V, HWY_IF_UI16(TFromV<V>)>
HWY_API V RotateRightSame(V v, int bits) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  const int shr_amt = bits & 15;
  const int shl_amt =
      static_cast<int>((0u - static_cast<unsigned>(bits)) & 15u);

  const auto vu = BitCast(du, v);
  return BitCast(d,
                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
}
#endif  // HWY_NATIVE_ROL_ROR_SAME_16

#if (defined(HWY_NATIVE_ROL_ROR_SAME_32_64) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_ROL_ROR_SAME_32_64
#undef HWY_NATIVE_ROL_ROR_SAME_32_64
#else
#define HWY_NATIVE_ROL_ROR_SAME_32_64
#endif

template <class V, HWY_IF_UI32(TFromV<V>)>
HWY_API V RotateLeftSame(V v, int bits) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  const int shl_amt = bits & 31;
  const int shr_amt =
      static_cast<int>((0u - static_cast<unsigned>(bits)) & 31u);

  const auto vu = BitCast(du, v);
  return BitCast(d,
                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
}

template <class V, HWY_IF_UI32(TFromV<V>)>
HWY_API V RotateRightSame(V v, int bits) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  const int shr_amt = bits & 31;
  const int shl_amt =
      static_cast<int>((0u - static_cast<unsigned>(bits)) & 31u);

  const auto vu = BitCast(du, v);
  return BitCast(d,
                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
}
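// The masking also makes RotateLeftSame/RotateRightSame well-defined for
// negative or oversized counts: e.g. for 32-bit lanes, RotateLeftSame(v, -3)
// equals RotateRightSame(v, 3) because (-3) & 31 == 29 and 3 & 31 == 3, and
// RotateLeftSame(v, 35) equals RotateLeftSame(v, 3).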
#if HWY_HAVE_INTEGER64
template <class V, HWY_IF_UI64(TFromV<V>)>
HWY_API V RotateLeftSame(V v, int bits) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  const int shl_amt = bits & 63;
  const int shr_amt =
      static_cast<int>((0u - static_cast<unsigned>(bits)) & 63u);

  const auto vu = BitCast(du, v);
  return BitCast(d,
                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
}

template <class V, HWY_IF_UI64(TFromV<V>)>
HWY_API V RotateRightSame(V v, int bits) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  const int shr_amt = bits & 63;
  const int shl_amt =
      static_cast<int>((0u - static_cast<unsigned>(bits)) & 63u);

  const auto vu = BitCast(du, v);
  return BitCast(d,
                 Or(ShiftLeftSame(vu, shl_amt), ShiftRightSame(vu, shr_amt)));
}
#endif  // HWY_HAVE_INTEGER64

#endif  // HWY_NATIVE_ROL_ROR_SAME_32_64

// ------------------------------ PromoteEvenTo/PromoteOddTo

// These are used by target-specific headers for ReorderWidenMulAccumulate etc.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
namespace detail {

// Tag dispatch is used in detail::PromoteEvenTo and detail::PromoteOddTo
// because there are target-specific specializations for some of the
// detail::PromoteEvenTo and detail::PromoteOddTo cases on
// SVE/PPC/SSE2/SSSE3/SSE4/AVX2.

// All targets except HWY_SCALAR use the implementations of
// detail::PromoteEvenTo and detail::PromoteOddTo below for at least some of
// the PromoteEvenTo and PromoteOddTo cases.

// Signed to signed PromoteEvenTo/PromoteOddTo
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(
    hwy::SignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::SignedTag /*from_type_tag*/, D d_to, V v) {
#if HWY_TARGET_IS_SVE
  // The intrinsic expects the wide lane type.
  return NativePromoteEvenTo(BitCast(d_to, v));
#else
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian targets, need to shift each lane of the bitcasted
  // vector left by kToLaneSize * 4 bits to get the bits of the even
  // source lanes into the upper kToLaneSize * 4 bits of even_in_hi.
  const auto even_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v));
#else
  // On big-endian targets, the bits of the even source lanes are already
  // in the upper kToLaneSize * 4 bits of the lanes of the bitcasted
  // vector.
  const auto even_in_hi = BitCast(d_to, v);
#endif

  // Right-shift even_in_hi by kToLaneSize * 4 bits
  return ShiftRight<kToLaneSize * 4>(even_in_hi);
#endif  // HWY_TARGET_IS_SVE
}

// Unsigned to unsigned PromoteEvenTo/PromoteOddTo
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(
    hwy::UnsignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
#if HWY_TARGET_IS_SVE
  // The intrinsic expects the wide lane type.
  return NativePromoteEvenTo(BitCast(d_to, v));
#else
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian targets, the bits of the even source lanes are already
  // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector.
  // Simply need to zero out the upper bits of each lane of the bitcasted
  // vector.
  return And(BitCast(d_to, v),
             Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
#else
  // On big-endian targets, need to shift each lane of the bitcasted vector
  // right by kToLaneSize * 4 bits to get the bits of the even source lanes
  // into the lower kToLaneSize * 4 bits of the result. The right shift below
  // also zeroes out the upper kToLaneSize * 4 bits of the result.
  return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
#endif
#endif  // HWY_TARGET_IS_SVE
}
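// Worked example for the signed overload above, promoting the even int8_t
// lanes to int16_t on a little-endian target: each even source lane occupies
// the low byte of a bitcasted 16-bit lane; ShiftLeft<8> moves it to the high
// byte, and the arithmetic ShiftRight<8> sign-extends it back down:
//   int8_t lanes   {0x7F, 0x01, 0x80, 0x02}
//   BitCast(d_to)  {0x017F, 0x0280}  (int16_t)
//   ShiftLeft<8>   {0x7F00, 0x8000}
//   ShiftRight<8>  {0x007F, 0xFF80} == {127, -128}, the even source lanes.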
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(
    hwy::SignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::SignedTag /*from_type_tag*/, D d_to, V v) {
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian targets, the bits of the odd source lanes are already in
  // the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector.
  const auto odd_in_hi = BitCast(d_to, v);
#else
  // On big-endian targets, need to shift each lane of the bitcasted vector
  // left by kToLaneSize * 4 bits to get the bits of the odd source lanes into
  // the upper kToLaneSize * 4 bits of odd_in_hi.
  const auto odd_in_hi = ShiftLeft<kToLaneSize * 4>(BitCast(d_to, v));
#endif

  // Right-shift odd_in_hi by kToLaneSize * 4 bits
  return ShiftRight<kToLaneSize * 4>(odd_in_hi);
}

template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(
    hwy::UnsignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian targets, need to shift each lane of the bitcasted vector
  // right by kToLaneSize * 4 bits to get the bits of the odd source lanes
  // into the lower kToLaneSize * 4 bits of the result. The right shift below
  // also zeroes out the upper kToLaneSize * 4 bits of the result.
  return ShiftRight<kToLaneSize * 4>(BitCast(d_to, v));
#else
  // On big-endian targets, the bits of the odd source lanes are already
  // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector.
  // Simply need to zero out the upper bits of each lane of the bitcasted
  // vector.
  return And(BitCast(d_to, v),
             Set(d_to, static_cast<TFromD<D>>(LimitsMax<TFromV<V>>())));
#endif
}

// Unsigned to signed: Same as unsigned->unsigned PromoteEvenTo/PromoteOddTo
// followed by BitCast to signed
template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteEvenTo(
    hwy::SignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
  const RebindToUnsigned<decltype(d_to)> du_to;
  return BitCast(d_to,
                 PromoteEvenTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(),
                               hwy::UnsignedTag(), du_to, v));
}

template <size_t kToLaneSize, class D, class V>
HWY_INLINE VFromD<D> PromoteOddTo(
    hwy::SignedTag /*to_type_tag*/,
    hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) {
  const RebindToUnsigned<decltype(d_to)> du_to;
  return BitCast(d_to,
                 PromoteOddTo(hwy::UnsignedTag(), hwy::SizeTag<kToLaneSize>(),
                              hwy::UnsignedTag(), du_to, v));
}

// BF16->F32 PromoteEvenTo
// NOTE: It is possible for FromTypeTag to be hwy::SignedTag or
// hwy::UnsignedTag instead of hwy::FloatTag on targets that use scalable
// vectors.
// VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same
// type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>>.
// The BF16->F32 PromoteEvenTo overload is only enabled if VBF16 is considered
// to be a bfloat16_t vector.
template <class FromTypeTag, class DF32, class VBF16,
          class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
          hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
HWY_INLINE VFromD<DF32> PromoteEvenTo(hwy::FloatTag /*to_type_tag*/,
                                      hwy::SizeTag<4> /*to_lane_size_tag*/,
                                      FromTypeTag /*from_type_tag*/, DF32 d_to,
                                      VBF16 v) {
  const RebindToUnsigned<decltype(d_to)> du_to;
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian platforms, need to shift left each lane of the bitcasted
  // vector by 16 bits.
  return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v)));
#else
  // On big-endian platforms, the even lanes of the source vector are already
  // in the upper 16 bits of the lanes of the bitcasted vector. Simply need to
  // zero out the lower 16 bits of each lane of the bitcasted vector.
  return BitCast(d_to,
                 And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u})));
#endif
}
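// The 16-bit shift/mask above works because bfloat16_t is the upper half of
// a binary32: placing a bf16 lane's bits in the upper 16 bits of a 32-bit
// lane, with zeros below, yields the exactly-widened float. For example,
// bf16 0x3F80 becomes u32 0x3F800000, i.e. 1.0f.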
// BF16->F32 PromoteOddTo
// NOTE: It is possible for FromTypeTag to be hwy::SignedTag or
// hwy::UnsignedTag instead of hwy::FloatTag on targets that use scalable
// vectors.
// VBF16 is considered to be a bfloat16_t vector if TFromV<VBF16> is the same
// type as TFromV<VFromD<Repartition<bfloat16_t, DF32>>>.
// The BF16->F32 PromoteOddTo overload is only enabled if VBF16 is considered
// to be a bfloat16_t vector.
template <class FromTypeTag, class DF32, class VBF16,
          class VBF16_2 = VFromD<Repartition<bfloat16_t, DF32>>,
          hwy::EnableIf<IsSame<TFromV<VBF16>, TFromV<VBF16_2>>()>* = nullptr>
HWY_INLINE VFromD<DF32> PromoteOddTo(hwy::FloatTag /*to_type_tag*/,
                                     hwy::SizeTag<4> /*to_lane_size_tag*/,
                                     FromTypeTag /*from_type_tag*/, DF32 d_to,
                                     VBF16 v) {
  const RebindToUnsigned<decltype(d_to)> du_to;
#if HWY_IS_LITTLE_ENDIAN
  // On little-endian platforms, the odd lanes of the source vector are
  // already in the upper 16 bits of the lanes of the bitcasted vector. Simply
  // need to zero out the lower 16 bits of each lane of the bitcasted vector.
  return BitCast(d_to,
                 And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u})));
#else
  // On big-endian platforms, need to shift left each lane of the bitcasted
  // vector by 16 bits.
  return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v)));
#endif
}

// Default PromoteEvenTo/PromoteOddTo implementations
template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
          class V, HWY_IF_LANES_D(DFromV<V>, 1)>
HWY_INLINE VFromD<D> PromoteEvenTo(
    ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    FromTypeTag /*from_type_tag*/, D d_to, V v) {
  // A single-lane vector has only one (even) lane, which is also the lower.
  return PromoteLowerTo(d_to, v);
}

template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
          class V, HWY_IF_LANES_GT_D(DFromV<V>, 1)>
HWY_INLINE VFromD<D> PromoteEvenTo(
    ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    FromTypeTag /*from_type_tag*/, D d_to, V v) {
  const DFromV<decltype(v)> d;
  return PromoteLowerTo(d_to, ConcatEven(d, v, v));
}

template <class ToTypeTag, size_t kToLaneSize, class FromTypeTag, class D,
          class V>
HWY_INLINE VFromD<D> PromoteOddTo(
    ToTypeTag /*to_type_tag*/, hwy::SizeTag<kToLaneSize> /*to_lane_size_tag*/,
    FromTypeTag /*from_type_tag*/, D d_to, V v) {
  const DFromV<decltype(v)> d;
  return PromoteLowerTo(d_to, ConcatOdd(d, v, v));
}

}  // namespace detail

template <class D, class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>) * 2),
          class V2 = VFromD<Repartition<TFromV<V>, D>>,
          HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
HWY_API VFromD<D> PromoteEvenTo(D d, V v) {
  return detail::PromoteEvenTo(hwy::TypeTag<TFromD<D>>(),
                               hwy::SizeTag<sizeof(TFromD<D>)>(),
                               hwy::TypeTag<TFromV<V>>(), d, v);
}

template <class D, class V, HWY_IF_T_SIZE_D(D, sizeof(TFromV<V>) * 2),
          class V2 = VFromD<Repartition<TFromV<V>, D>>,
          HWY_IF_LANES_D(DFromV<V>, HWY_MAX_LANES_V(V2))>
HWY_API VFromD<D> PromoteOddTo(D d, V v) {
  return detail::PromoteOddTo(hwy::TypeTag<TFromD<D>>(),
                              hwy::SizeTag<sizeof(TFromD<D>)>(),
                              hwy::TypeTag<TFromV<V>>(), d, v);
}

#endif  // HWY_TARGET != HWY_SCALAR

#ifdef HWY_INSIDE_END_NAMESPACE
#undef HWY_INSIDE_END_NAMESPACE
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();
#endif
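// Usage sketch for the public PromoteEvenTo/PromoteOddTo above (example only;
// the helper name is hypothetical and this assumes inclusion via a target
// header so that Iota and RepartitionToNarrow are available):
#if 0
template <class D32>  // e.g. ScalableTag<int32_t>
HWY_API void ExamplePromoteEvenOdd(D32 d32) {
  const RepartitionToNarrow<D32> d16;         // int16_t, twice the lanes
  const auto v16 = Iota(d16, 0);              // lane i holds the value i
  const auto even = PromoteEvenTo(d32, v16);  // 0, 2, 4, ...
  const auto odd = PromoteOddTo(d32, v16);    // 1, 3, 5, ...
  (void)even;
  (void)odd;
}
#endif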