Universal intrinsics
Overview
“Universal intrinsics” is a set of types and functions intended to simplify vectorization of code on different platforms. More…
// typedefs typedef v_reg<float, 4> cv::v_float32x4; typedef v_reg<double, 2> cv::v_float64x2; typedef v_reg<short, 8> cv::v_int16x8; typedef v_reg<int, 4> cv::v_int32x4; typedef v_reg<int64, 2> cv::v_int64x2; typedef v_reg<schar, 16> cv::v_int8x16; typedef v_reg<ushort, 8> cv::v_uint16x8; typedef v_reg<unsigned, 4> cv::v_uint32x4; typedef v_reg<uint64, 2> cv::v_uint64x2; typedef v_reg<uchar, 16> cv::v_uint8x16; // structs template < typename _Tp, int n > struct cv::v_reg; // global variables static const unsigned char cv::popCountTable[]; // global functions v_uint8x16 cv::v_setzero_u8(); v_int8x16 cv::v_setzero_s8(); v_uint16x8 cv::v_setzero_u16(); v_int16x8 cv::v_setzero_s16(); v_uint32x4 cv::v_setzero_u32(); v_int32x4 cv::v_setzero_s32(); v_float32x4 cv::v_setzero_f32(); v_float64x2 cv::v_setzero_f64(); v_uint64x2 cv::v_setzero_u64(); v_int64x2 cv::v_setzero_s64(); v_uint8x16 cv::v_setall_u8(uchar val); v_int8x16 cv::v_setall_s8(schar val); v_uint16x8 cv::v_setall_u16(ushort val); v_int16x8 cv::v_setall_s16(short val); v_uint32x4 cv::v_setall_u32(unsigned val); v_int32x4 cv::v_setall_s32(int val); v_float32x4 cv::v_setall_f32(float val); v_float64x2 cv::v_setall_f64(double val); v_uint64x2 cv::v_setall_u64(uint64 val); v_int64x2 cv::v_setall_s64(int64 val); template < typename _Tp0, int n0 > v_uint8x16 cv::v_reinterpret_as_u8(const v_reg<_Tp0, n0>& a); template < typename _Tp0, int n0 > v_int8x16 cv::v_reinterpret_as_s8(const v_reg<_Tp0, n0>& a); template < typename _Tp0, int n0 > v_uint16x8 cv::v_reinterpret_as_u16(const v_reg<_Tp0, n0>& a); template < typename _Tp0, int n0 > v_int16x8 cv::v_reinterpret_as_s16(const v_reg<_Tp0, n0>& a); template < typename _Tp0, int n0 > v_uint32x4 cv::v_reinterpret_as_u32(const v_reg<_Tp0, n0>& a); template < typename _Tp0, int n0 > v_int32x4 cv::v_reinterpret_as_s32(const v_reg<_Tp0, n0>& a); template < typename _Tp0, int n0 > v_float32x4 cv::v_reinterpret_as_f32(const v_reg<_Tp0, n0>& a); template < typename _Tp0, int n0 > 
v_float64x2 cv::v_reinterpret_as_f64(const v_reg<_Tp0, n0>& a); template < typename _Tp0, int n0 > v_uint64x2 cv::v_reinterpret_as_u64(const v_reg<_Tp0, n0>& a); template < typename _Tp0, int n0 > v_int64x2 cv::v_reinterpret_as_s64(const v_reg<_Tp0, n0>& a); template <int n> v_uint16x8 cv::v_shl(const v_uint16x8& a); template <int n> v_int16x8 cv::v_shl(const v_int16x8& a); template <int n> v_uint32x4 cv::v_shl(const v_uint32x4& a); template <int n> v_int32x4 cv::v_shl(const v_int32x4& a); template <int n> v_uint64x2 cv::v_shl(const v_uint64x2& a); template <int n> v_int64x2 cv::v_shl(const v_int64x2& a); template <int n> v_uint16x8 cv::v_shr(const v_uint16x8& a); template <int n> v_int16x8 cv::v_shr(const v_int16x8& a); template <int n> v_uint32x4 cv::v_shr(const v_uint32x4& a); template <int n> v_int32x4 cv::v_shr(const v_int32x4& a); template <int n> v_uint64x2 cv::v_shr(const v_uint64x2& a); template <int n> v_int64x2 cv::v_shr(const v_int64x2& a); template <int n> v_uint16x8 cv::v_rshr(const v_uint16x8& a); template <int n> v_int16x8 cv::v_rshr(const v_int16x8& a); template <int n> v_uint32x4 cv::v_rshr(const v_uint32x4& a); template <int n> v_int32x4 cv::v_rshr(const v_int32x4& a); template <int n> v_uint64x2 cv::v_rshr(const v_uint64x2& a); template <int n> v_int64x2 cv::v_rshr(const v_int64x2& a); v_uint8x16 cv::v_pack( const v_uint16x8& a, const v_uint16x8& b ); v_int8x16 cv::v_pack( const v_int16x8& a, const v_int16x8& b ); v_uint16x8 cv::v_pack( const v_uint32x4& a, const v_uint32x4& b ); v_int16x8 cv::v_pack( const v_int32x4& a, const v_int32x4& b ); v_uint32x4 cv::v_pack( const v_uint64x2& a, const v_uint64x2& b ); v_int32x4 cv::v_pack( const v_int64x2& a, const v_int64x2& b ); v_uint8x16 cv::v_pack_u( const v_int16x8& a, const v_int16x8& b ); v_uint16x8 cv::v_pack_u( const v_int32x4& a, const v_int32x4& b ); template <int n> v_uint8x16 cv::v_rshr_pack( const v_uint16x8& a, const v_uint16x8& b ); template <int n> v_int8x16 cv::v_rshr_pack( const 
v_int16x8& a, const v_int16x8& b ); template <int n> v_uint16x8 cv::v_rshr_pack( const v_uint32x4& a, const v_uint32x4& b ); template <int n> v_int16x8 cv::v_rshr_pack( const v_int32x4& a, const v_int32x4& b ); template <int n> v_uint32x4 cv::v_rshr_pack( const v_uint64x2& a, const v_uint64x2& b ); template <int n> v_int32x4 cv::v_rshr_pack( const v_int64x2& a, const v_int64x2& b ); template <int n> v_uint8x16 cv::v_rshr_pack_u( const v_int16x8& a, const v_int16x8& b ); template <int n> v_uint16x8 cv::v_rshr_pack_u( const v_int32x4& a, const v_int32x4& b ); void cv::v_pack_store( uchar* ptr, const v_uint16x8& a ); void cv::v_pack_store( schar* ptr, const v_int16x8& a ); void cv::v_pack_store( ushort* ptr, const v_uint32x4& a ); void cv::v_pack_store( short* ptr, const v_int32x4& a ); void cv::v_pack_store( unsigned* ptr, const v_uint64x2& a ); void cv::v_pack_store( int* ptr, const v_int64x2& a ); void cv::v_pack_u_store( uchar* ptr, const v_int16x8& a ); void cv::v_pack_u_store( ushort* ptr, const v_int32x4& a ); template <int n> void cv::v_rshr_pack_store( uchar* ptr, const v_uint16x8& a ); template <int n> void cv::v_rshr_pack_store( schar* ptr, const v_int16x8& a ); template <int n> void cv::v_rshr_pack_store( ushort* ptr, const v_uint32x4& a ); template <int n> void cv::v_rshr_pack_store( short* ptr, const v_int32x4& a ); template <int n> void cv::v_rshr_pack_store( unsigned* ptr, const v_uint64x2& a ); template <int n> void cv::v_rshr_pack_store( int* ptr, const v_int64x2& a ); template <int n> void cv::v_rshr_pack_u_store( uchar* ptr, const v_int16x8& a ); template <int n> void cv::v_rshr_pack_u_store( ushort* ptr, const v_int32x4& a ); template < typename _Tp, int n > v_reg<_Tp, n> cv::operator!=( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > v_reg<_Tp, n> cv::operator&( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > v_reg<_Tp, n>& cv::operator&=( v_reg<_Tp, n>& a, const v_reg<_Tp, n>& 
b ); template < typename _Tp, int n > v_reg<_Tp, n> cv::operator*( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > v_reg<_Tp, n>& cv::operator*=( v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > v_reg<_Tp, n> cv::operator+( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > v_reg<_Tp, n>& cv::operator+=( v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > v_reg<_Tp, n> cv::operator-( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > v_reg<_Tp, n>& cv::operator-=( v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > v_reg<_Tp, n> cv::operator/( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > v_reg<_Tp, n>& cv::operator/=( v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > v_reg<_Tp, n> cv::operator<( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > v_reg<_Tp, n> cv::operator<<( const v_reg<_Tp, n>& a, int imm ); template < typename _Tp, int n > v_reg<_Tp, n> cv::operator<=( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > v_reg<_Tp, n> cv::operator==( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > v_reg<_Tp, n> cv::operator>( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > v_reg<_Tp, n> cv::operator>=( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > v_reg<_Tp, n> cv::operator>>( const v_reg<_Tp, n>& a, int imm ); template < typename _Tp, int n > v_reg<_Tp, n> cv::operator^( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > v_reg<_Tp, n>& cv::operator^=( v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > v_reg<_Tp, n> cv::operator|( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename 
_Tp, int n > v_reg<_Tp, n>& cv::operator|=( v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > v_reg<_Tp, n> cv::operator~(const v_reg<_Tp, n>& a); template < typename _Tp, int n > v_reg<typename V_TypeTraits<_Tp>::abs_type, n> cv::v_abs(const v_reg<_Tp, n>& a); template < typename _Tp, int n > v_reg<typename V_TypeTraits<_Tp>::abs_type, n> cv::v_absdiff( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); v_float32x4 cv::v_absdiff( const v_float32x4& a, const v_float32x4& b ); v_float64x2 cv::v_absdiff( const v_float64x2& a, const v_float64x2& b ); template < typename _Tp, int n > v_reg<_Tp, n> cv::v_add_wrap( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > v_reg<int, n> cv::v_ceil(const v_reg<_Tp, n>& a); template <int n> v_reg<int, n> cv::v_ceil(const v_reg<float, n>& a); template <int n> v_reg<int, n*2> cv::v_ceil(const v_reg<double, n>& a); template < typename _Tp, int n > bool cv::v_check_all(const v_reg<_Tp, n>& a); template < typename _Tp, int n > bool cv::v_check_any(const v_reg<_Tp, n>& a); template < typename _Tp, int n > v_reg<_Tp, n> cv::v_combine_high( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > v_reg<_Tp, n> cv::v_combine_low( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template <int n> v_reg<float, n> cv::v_cvt_f32(const v_reg<int, n>& a); template <int n> v_reg<double, n> cv::v_cvt_f64(const v_reg<int, n*2>& a); template <int n> v_reg<double, n> cv::v_cvt_f64(const v_reg<float, n*2>& a); template < typename _Tp, int n > v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> cv::v_dotprod( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > void cv::v_expand( const v_reg<_Tp, n>& a, v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0, v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1 ); template < int s, typename _Tp, int n > v_reg<_Tp, n> cv::v_extract( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename 
_Tp, int n > v_reg<int, n> cv::v_floor(const v_reg<_Tp, n>& a); template <int n> v_reg<int, n> cv::v_floor(const v_reg<float, n>& a); template <int n> v_reg<int, n*2> cv::v_floor(const v_reg<double, n>& a); template < typename _Tp, int n > v_reg<_Tp, n> cv::v_invsqrt(const v_reg<_Tp, n>& a); template <typename _Tp> v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> cv::v_load(const _Tp* ptr); template <typename _Tp> v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> cv::v_load_aligned(const _Tp* ptr); template < typename _Tp, int n > void cv::v_load_deinterleave( const _Tp* ptr, v_reg<_Tp, n>& a, v_reg<_Tp, n>& b ); template < typename _Tp, int n > void cv::v_load_deinterleave( const _Tp* ptr, v_reg<_Tp, n>& a, v_reg<_Tp, n>& b, v_reg<_Tp, n>& c ); template < typename _Tp, int n > void cv::v_load_deinterleave( const _Tp* ptr, v_reg<_Tp, n>& a, v_reg<_Tp, n>& b, v_reg<_Tp, n>& c, v_reg<_Tp, n>& d ); template <typename _Tp> v_reg<typename V_TypeTraits<_Tp>::w_type, V_SIMD128Traits<_Tp>::nlanes/2> cv::v_load_expand(const _Tp* ptr); template <typename _Tp> v_reg<typename V_TypeTraits<_Tp>::q_type, V_SIMD128Traits<_Tp>::nlanes/4> cv::v_load_expand_q(const _Tp* ptr); template <typename _Tp> v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> cv::v_load_halves( const _Tp* loptr, const _Tp* hiptr ); template < typename _Tp, int n > v_reg<_Tp, n> cv::v_magnitude( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); v_float32x4 cv::v_matmul( const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3 ); template < typename _Tp, int n > v_reg<_Tp, n> cv::v_max( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > v_reg<_Tp, n> cv::v_min( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > void cv::v_mul_expand( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c, v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d ); template < typename _Tp, int n > 
v_reg<_Tp, n> cv::v_muladd( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c ); template < typename _Tp, int n > v_uint32x4 cv::v_popcount(const v_reg<_Tp, n>& a); template < typename _Tp, int n > void cv::v_recombine( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, v_reg<_Tp, n>& low, v_reg<_Tp, n>& high ); template < typename _Tp, int n > _Tp cv::v_reduce_max(const v_reg<_Tp, n>& a); template < typename _Tp, int n > _Tp cv::v_reduce_min(const v_reg<_Tp, n>& a); template < typename _Tp, int n > V_TypeTraits<_Tp>::sum_type cv::v_reduce_sum(const v_reg<_Tp, n>& a); v_float32x4 cv::v_reduce_sum4( const v_float32x4& a, const v_float32x4& b, const v_float32x4& c, const v_float32x4& d ); template < typename _Tp, int n > v_reg<int, n> cv::v_round(const v_reg<_Tp, n>& a); template <int n> v_reg<int, n> cv::v_round(const v_reg<float, n>& a); template <int n> v_reg<int, n*2> cv::v_round(const v_reg<double, n>& a); template < typename _Tp, int n > v_reg<_Tp, n> cv::v_select( const v_reg<_Tp, n>& mask, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > int cv::v_signmask(const v_reg<_Tp, n>& a); template < typename _Tp, int n > v_reg<_Tp, n> cv::v_sqr_magnitude( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > v_reg<_Tp, n> cv::v_sqrt(const v_reg<_Tp, n>& a); template < typename _Tp, int n > void cv::v_store( _Tp* ptr, const v_reg<_Tp, n>& a ); template < typename _Tp, int n > void cv::v_store_aligned( _Tp* ptr, const v_reg<_Tp, n>& a ); template < typename _Tp, int n > void cv::v_store_high( _Tp* ptr, const v_reg<_Tp, n>& a ); template < typename _Tp, int n > void cv::v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template < typename _Tp, int n > void cv::v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c ); template < typename _Tp, int n > void cv::v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, 
const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c, const v_reg<_Tp, n>& d ); template < typename _Tp, int n > void cv::v_store_low( _Tp* ptr, const v_reg<_Tp, n>& a ); template < typename _Tp, int n > v_reg<_Tp, n> cv::v_sub_wrap( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b ); template <typename _Tp> void cv::v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1, const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3, v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1, v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 ); template < typename _Tp, int n > v_reg<int, n> cv::v_trunc(const v_reg<_Tp, n>& a); template <int n> v_reg<int, n> cv::v_trunc(const v_reg<float, n>& a); template <int n> v_reg<int, n*2> cv::v_trunc(const v_reg<double, n>& a); template < typename _Tp, int n > void cv::v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1, v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 ); // macros #define CV_SIMD128 #define CV_SIMD128_64F
Detailed Documentation
“Universal intrinsics” is a set of types and functions intended to simplify vectorization of code on different platforms. Currently there are two supported SIMD extensions: SSE/SSE2 on x86 architectures and NEON on ARM architectures; both allow working with 128-bit registers containing packed values of different types. If no SIMD extension is available during compilation, the fallback C++ implementation of the intrinsics will be chosen and the code will work as expected, although it could be slower.
Types
There are several types representing a 128-bit register as a vector of packed values; each type is implemented as a structure based on one SIMD register.
- cv::v_uint8x16 and cv::v_int8x16 : sixteen 8-bit integer values (unsigned/signed) - char
- cv::v_uint16x8 and cv::v_int16x8 : eight 16-bit integer values (unsigned/signed) - short
- cv::v_uint32x4 and cv::v_int32x4 : four 32-bit integer values (unsigned/signed) - int
- cv::v_uint64x2 and cv::v_int64x2 : two 64-bit integer values (unsigned/signed) - int64
- cv::v_float32x4 : four 32-bit floating point values (signed) - float
- cv::v_float64x2 : two 64-bit floating point values (signed) - double
cv::v_float64x2 is not implemented in the NEON variant; if you want to use this type, don’t forget to check the CV_SIMD128_64F preprocessor definition:
#if CV_SIMD128_64F //... #endif
Load and store operations
These operations allow setting the contents of a register explicitly or by loading them from a memory block, and saving the contents of a register to a memory block.
- Constructors: from memory, from two values, …
- Other create methods: v_setall_s8, v_setall_u8, …, v_setzero_u8, v_setzero_s8, …
- Memory operations: v_load, v_load_aligned, v_load_halves, v_store, v_store_aligned, v_store_high, v_store_low
Value reordering
These operations allow reordering or recombining elements in one or multiple vectors.
- Interleave, deinterleave (2, 3 and 4 channels): v_load_deinterleave, v_store_interleave
- Expand: v_load_expand, v_load_expand_q, v_expand
- Pack: v_pack, v_pack_u, v_rshr_pack, v_rshr_pack_u, v_pack_store, v_pack_u_store, v_rshr_pack_store, v_rshr_pack_u_store
- Recombine: v_zip, v_recombine, v_combine_low, v_combine_high
- Extract: v_extract
Arithmetic, bitwise and comparison operations
Element-wise binary and unary operations.
- Arithmetics: +, -, *, /, v_mul_expand
- Non-saturating arithmetics: v_add_wrap, v_sub_wrap
- Bitwise shifts: <<, >>, v_shl, v_shr
- Bitwise logic: &, |, ^, ~
- Comparison: >, >=, <, <=, ==, !=
- min/max: v_min, v_max
Reduce and mask
Most of these operations return only one value.
- Reduce: v_reduce_min, v_reduce_max, v_reduce_sum, v_popcount
- Mask: v_signmask, v_check_all, v_check_any, v_select
Other math
- Some frequent operations: v_sqrt, v_invsqrt, v_magnitude, v_sqr_magnitude
- Absolute values: v_abs, v_absdiff
Conversions
Different type conversions and casts:
- Rounding: v_round, v_floor, v_ceil, v_trunc
- To float: v_cvt_f32, v_cvt_f64
- Reinterpret: v_reinterpret_as_u8, v_reinterpret_as_s8, …
Matrix operations
In these operations vectors represent matrix rows/columns: v_dotprod, v_matmul, v_transpose4x4
Usability
Most operations are implemented only for some subset of the available types; the following matrices show the applicability of different operations to the types.
Regular integers:
Operations \ Types | uint 8x16 | int 8x16 | uint 16x8 | int 16x8 | uint 32x4 | int 32x4 |
load, store        |     x     |    x     |     x     |    x     |     x     |    x     |
interleave         |     x     |    x     |     x     |    x     |     x     |    x     |
expand             |     x     |    x     |     x     |    x     |     x     |    x     |
expand_q           |     x     |    x     |           |          |           |          |
add, sub           |     x     |    x     |     x     |    x     |     x     |    x     |
add_wrap, sub_wrap |     x     |    x     |     x     |    x     |           |          |
mul                |           |          |     x     |    x     |     x     |    x     |
mul_expand         |           |          |     x     |    x     |     x     |          |
compare            |     x     |    x     |     x     |    x     |     x     |    x     |
shift              |           |          |     x     |    x     |     x     |    x     |
dotprod            |           |          |           |    x     |           |          |
logical            |     x     |    x     |     x     |    x     |     x     |    x     |
min, max           |     x     |    x     |     x     |    x     |     x     |    x     |
absdiff            |     x     |    x     |     x     |    x     |     x     |    x     |
reduce             |           |          |           |          |     x     |    x     |
mask               |     x     |    x     |     x     |    x     |     x     |    x     |
pack               |     x     |    x     |     x     |    x     |     x     |    x     |
pack_u             |     x     |          |     x     |          |           |          |
unpack             |     x     |    x     |     x     |    x     |     x     |    x     |
extract            |     x     |    x     |     x     |    x     |     x     |    x     |
cvt_flt32          |           |          |           |          |           |    x     |
cvt_flt64          |           |          |           |          |           |    x     |
transpose4x4       |           |          |           |          |     x     |    x     |
Big integers:
Operations \ Types | uint 64x2 | int 64x2 |
load, store        |     x     |    x     |
add, sub           |     x     |    x     |
shift              |     x     |    x     |
logical            |     x     |    x     |
extract            |     x     |    x     |
Floating point:
Operations \ Types | float 32x4 | float 64x2 |
load, store        |     x      |     x      |
interleave         |     x      |            |
add, sub           |     x      |     x      |
mul                |     x      |     x      |
div                |     x      |     x      |
compare            |     x      |     x      |
min, max           |     x      |     x      |
absdiff            |     x      |     x      |
reduce             |     x      |            |
mask               |     x      |     x      |
unpack             |     x      |     x      |
cvt_flt32          |            |     x      |
cvt_flt64          |     x      |            |
sqrt, abs          |     x      |     x      |
float math         |     x      |     x      |
transpose4x4       |     x      |            |
Typedefs
typedef v_reg<float, 4> cv::v_float32x4
Four 32-bit floating point values (single precision)
typedef v_reg<double, 2> cv::v_float64x2
Two 64-bit floating point values (double precision)
typedef v_reg<short, 8> cv::v_int16x8
Eight 16-bit signed integer values.
typedef v_reg<int, 4> cv::v_int32x4
Four 32-bit signed integer values.
typedef v_reg<int64, 2> cv::v_int64x2
Two 64-bit signed integer values.
typedef v_reg<schar, 16> cv::v_int8x16
Sixteen 8-bit signed integer values.
typedef v_reg<ushort, 8> cv::v_uint16x8
Eight 16-bit unsigned integer values.
typedef v_reg<unsigned, 4> cv::v_uint32x4
Four 32-bit unsigned integer values.
typedef v_reg<uint64, 2> cv::v_uint64x2
Two 64-bit unsigned integer values.
typedef v_reg<uchar, 16> cv::v_uint8x16
Sixteen 8-bit unsigned integer values.
Global Functions
template < typename _Tp, int n > v_reg<_Tp, n> cv::operator!=( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Not equal comparison.
For all types except 64-bit integer values.
template < typename _Tp, int n > v_reg<_Tp, n> cv::operator&( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Bitwise AND.
Only for integer types.
template < typename _Tp, int n > v_reg<_Tp, n> cv::operator*( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Multiply values.
For 16- and 32-bit integer types and floating types.
template < typename _Tp, int n > v_reg<_Tp, n> cv::operator+( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Add values.
For all types.
template < typename _Tp, int n > v_reg<_Tp, n> cv::operator-( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Subtract values.
For all types.
template < typename _Tp, int n > v_reg<_Tp, n> cv::operator/( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Divide values.
For floating types only.
template < typename _Tp, int n > v_reg<_Tp, n> cv::operator<( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Less-than comparison.
For all types except 64-bit integer values.
template < typename _Tp, int n > v_reg<_Tp, n> cv::operator<<( const v_reg<_Tp, n>& a, int imm )
Bitwise shift left.
For 16-, 32- and 64-bit integer values.
template < typename _Tp, int n > v_reg<_Tp, n> cv::operator<=( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Less-than or equal comparison.
For all types except 64-bit integer values.
template < typename _Tp, int n > v_reg<_Tp, n> cv::operator==( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Equal comparison.
For all types except 64-bit integer values.
template < typename _Tp, int n > v_reg<_Tp, n> cv::operator>( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Greater-than comparison.
For all types except 64-bit integer values.
template < typename _Tp, int n > v_reg<_Tp, n> cv::operator>=( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Greater-than or equal comparison.
For all types except 64-bit integer values.
template < typename _Tp, int n > v_reg<_Tp, n> cv::operator>>( const v_reg<_Tp, n>& a, int imm )
Bitwise shift right.
For 16-, 32- and 64-bit integer values.
template < typename _Tp, int n > v_reg<_Tp, n> cv::operator^( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Bitwise XOR.
Only for integer types.
template < typename _Tp, int n > v_reg<_Tp, n> cv::operator|( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Bitwise OR.
Only for integer types.
template < typename _Tp, int n > v_reg<_Tp, n> cv::operator~(const v_reg<_Tp, n>& a)
Bitwise NOT.
Only for integer types.
template < typename _Tp, int n > v_reg<typename V_TypeTraits<_Tp>::abs_type, n> cv::v_abs(const v_reg<_Tp, n>& a)
Absolute value of elements.
Only for floating point types.
template < typename _Tp, int n > v_reg<typename V_TypeTraits<_Tp>::abs_type, n> cv::v_absdiff( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Absolute difference.
Returns \(|a - b|\) converted to corresponding unsigned type. Example:
v_int32x4 a, b; // {1, 2, 3, 4} and {4, 3, 2, 1} v_uint32x4 c = v_absdiff(a, b); // result is {3, 1, 1, 3}
For 8-, 16-, 32-bit integer source types.
v_float32x4 cv::v_absdiff( const v_float32x4& a, const v_float32x4& b )
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
For 32-bit floating point values
v_float64x2 cv::v_absdiff( const v_float64x2& a, const v_float64x2& b )
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
For 64-bit floating point values
template < typename _Tp, int n > v_reg<_Tp, n> cv::v_add_wrap( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Add values without saturation.
For 8- and 16-bit integer values.
template < typename _Tp, int n > v_reg<int, n> cv::v_ceil(const v_reg<_Tp, n>& a)
Ceil elements.
Only for floating point types.
template <int n> v_reg<int, n> cv::v_ceil(const v_reg<float, n>& a)
Ceil.
Ceil each value. Input type is float vector ==> output type is int vector.
template <int n> v_reg<int, n*2> cv::v_ceil(const v_reg<double, n>& a)
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
template < typename _Tp, int n > bool cv::v_check_all(const v_reg<_Tp, n>& a)
Check if all packed values are less than zero.
Unsigned values will be cast to signed: uchar 254 => char -2.
For all types except 64-bit.
template < typename _Tp, int n > bool cv::v_check_any(const v_reg<_Tp, n>& a)
Check if any of packed values is less than zero.
Unsigned values will be cast to signed: uchar 254 => char -2.
For all types except 64-bit.
template < typename _Tp, int n > v_reg<_Tp, n> cv::v_combine_high( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Combine vector from last elements of two vectors.
Scheme:
{A1 A2 A3 A4} {B1 B2 B3 B4} --------------- {A3 A4 B3 B4}
For all types except 64-bit.
template < typename _Tp, int n > v_reg<_Tp, n> cv::v_combine_low( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Combine vector from first elements of two vectors.
Scheme:
{A1 A2 A3 A4} {B1 B2 B3 B4} --------------- {A1 A2 B1 B2}
For all types except 64-bit.
template <int n> v_reg<float, n> cv::v_cvt_f32(const v_reg<int, n>& a)
Convert to float.
Supported input type is cv::v_int32x4.
template <int n> v_reg<double, n> cv::v_cvt_f64(const v_reg<int, n*2>& a)
Convert to double.
Supported input type is cv::v_int32x4.
template <int n> v_reg<double, n> cv::v_cvt_f64(const v_reg<float, n*2>& a)
Convert to double.
Supported input type is cv::v_float32x4.
template < typename _Tp, int n > v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> cv::v_dotprod( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Dot product of elements.
Multiply values in two registers and sum adjacent result pairs. Scheme:
{A1 A2 ...} // 16-bit x {B1 B2 ...} // 16-bit ------------- {A1B1+A2B2 ...} // 32-bit
Implemented only for 16-bit signed source type (v_int16x8).
template < typename _Tp, int n > void cv::v_expand( const v_reg<_Tp, n>& a, v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0, v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1 )
Expand values to the wider pack type.
Copy contents of register to two registers with 2x wider pack type. Scheme:
int32x4 int64x2 int64x2 {A B C D} ==> {A B} , {C D}
template < int s, typename _Tp, int n > v_reg<_Tp, n> cv::v_extract( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Vector extract.
Scheme:
{A1 A2 A3 A4} {B1 B2 B3 B4} ======================== shift = 1 {A2 A3 A4 B1} shift = 2 {A3 A4 B1 B2} shift = 3 {A4 B1 B2 B3}
Restriction: 0 <= shift < nlanes
Usage:
v_int32x4 a, b, c; c = v_extract<2>(a, b);
For integer types only.
template < typename _Tp, int n > v_reg<int, n> cv::v_floor(const v_reg<_Tp, n>& a)
Floor elements.
Only for floating point types.
template <int n> v_reg<int, n> cv::v_floor(const v_reg<float, n>& a)
Floor.
Floor each value. Input type is float vector ==> output type is int vector.
template <int n> v_reg<int, n*2> cv::v_floor(const v_reg<double, n>& a)
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
template < typename _Tp, int n > v_reg<_Tp, n> cv::v_invsqrt(const v_reg<_Tp, n>& a)
Inverse square root.
Returns \(1/\sqrt{a}\). For floating point types only.
template <typename _Tp> v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> cv::v_load(const _Tp* ptr)
Load register contents from memory.
Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc.
Parameters:
ptr | pointer to memory block with data |
Returns:
register object
template <typename _Tp> v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> cv::v_load_aligned(const _Tp* ptr)
Load register contents from memory (aligned)
similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary)
template < typename _Tp, int n > void cv::v_load_deinterleave( const _Tp* ptr, v_reg<_Tp, n>& a, v_reg<_Tp, n>& b )
Load and deinterleave (2 channels)
Load data from memory, deinterleave it, and store the result in 2 registers. Scheme:
{A1 B1 A2 B2 ...} ==> {A1 A2 ...}, {B1 B2 ...}
For all types except 64-bit.
template < typename _Tp, int n > void cv::v_load_deinterleave( const _Tp* ptr, v_reg<_Tp, n>& a, v_reg<_Tp, n>& b, v_reg<_Tp, n>& c )
Load and deinterleave (3 channels)
Load data from memory, deinterleave it, and store the result in 3 registers. Scheme:
{A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
For all types except 64-bit.
template < typename _Tp, int n > void cv::v_load_deinterleave( const _Tp* ptr, v_reg<_Tp, n>& a, v_reg<_Tp, n>& b, v_reg<_Tp, n>& c, v_reg<_Tp, n>& d )
Load and deinterleave (4 channels)
Load data from memory, deinterleave it, and store the result in 4 registers. Scheme:
{A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
For all types except 64-bit.
template <typename _Tp> v_reg<typename V_TypeTraits<_Tp>::w_type, V_SIMD128Traits<_Tp>::nlanes/2> cv::v_load_expand(const _Tp* ptr)
Load register contents from memory with double expand.
Same as cv::v_load, but result pack type will be 2x wider than memory type.
short buf[4] = {1, 2, 3, 4}; // type is int16 v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32
For 8-, 16-, 32-bit integer source types.
template <typename _Tp> v_reg<typename V_TypeTraits<_Tp>::q_type, V_SIMD128Traits<_Tp>::nlanes/4> cv::v_load_expand_q(const _Tp* ptr)
Load register contents from memory with quad expand.
Same as cv::v_load_expand, but result type is 4 times wider than source.
char buf[4] = {1, 2, 3, 4}; // type is int8 v_int32x4 r = v_load_expand_q(buf); // r = {1, 2, 3, 4} - type is int32
For 8-bit integer source types.
template <typename _Tp> v_reg<_Tp, V_SIMD128Traits<_Tp>::nlanes> cv::v_load_halves( const _Tp* loptr, const _Tp* hiptr )
Load register contents from two memory blocks.
int lo[2] = { 1, 2 }, hi[2] = { 3, 4 }; v_int32x4 r = v_load_halves(lo, hi);
Parameters:
loptr | memory block containing data for first half (0..n/2) |
hiptr | memory block containing data for second half (n/2..n) |
template < typename _Tp, int n > v_reg<_Tp, n> cv::v_magnitude( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Magnitude.
Returns \(\sqrt{a^2 + b^2}\). For floating point types only.
v_float32x4 cv::v_matmul( const v_float32x4& v, const v_float32x4& m0, const v_float32x4& m1, const v_float32x4& m2, const v_float32x4& m3 )
Matrix multiplication.
Scheme:
{A0 A1 A2 A3} |V0| {B0 B1 B2 B3} |V1| {C0 C1 C2 C3} |V2| {D0 D1 D2 D3} x |V3| ==================== {R0 R1 R2 R3}, where: R0 = A0V0 + A1V1 + A2V2 + A3V3, R1 = B0V0 + B1V1 + B2V2 + B3V3 ...
template < typename _Tp, int n > v_reg<_Tp, n> cv::v_max( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Choose max values for each pair.
Scheme:
{A1 A2 ...} {B1 B2 ...} -------------- {max(A1,B1) max(A2,B2) ...}
For all types except 64-bit integer.
template < typename _Tp, int n > v_reg<_Tp, n> cv::v_min( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Choose min values for each pair.
Scheme:
{A1 A2 ...} {B1 B2 ...} -------------- {min(A1,B1) min(A2,B2) ...}
For all types except 64-bit integer.
template < typename _Tp, int n > void cv::v_mul_expand( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c, v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d )
Multiply and expand.
Multiply values of two registers and store the results in two registers with a wider pack type. Scheme:
{A B C D} // 32-bit x {E F G H} // 32-bit --------------- {AE BF} // 64-bit {CG DH} // 64-bit
Example:
v_uint32x4 a, b; // {1,2,3,4} and {2,2,2,2} v_uint64x2 c, d; // results v_mul_expand(a, b, c, d); // c, d = {2,4}, {6, 8}
Implemented only for 16- and unsigned 32-bit source types (v_int16x8, v_uint16x8, v_uint32x4).
template < typename _Tp, int n > v_reg<_Tp, n> cv::v_muladd( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c )
Multiply and add.
Returns \(a*b + c\). For floating point types only.
template < typename _Tp, int n > v_uint32x4 cv::v_popcount(const v_reg<_Tp, n>& a)
Count the 1 bits in the vector and return 4 values.
Scheme:
{A1 A2 A3 ...} => popcount(A1)
For all types; the result is always returned as v_uint32x4.
template < typename _Tp, int n > void cv::v_recombine( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, v_reg<_Tp, n>& low, v_reg<_Tp, n>& high )
Combine two vectors from lower and higher parts of two other vectors.
low = cv::v_combine_low(a, b); high = cv::v_combine_high(a, b);
template < typename _Tp, int n > _Tp cv::v_reduce_max(const v_reg<_Tp, n>& a)
Find one max value.
Scheme:
{A1 A2 A3 ...} => max(A1,A2,A3,...)
For 32-bit integer and 32-bit floating point types.
template < typename _Tp, int n > _Tp cv::v_reduce_min(const v_reg<_Tp, n>& a)
Find one min value.
Scheme:
{A1 A2 A3 ...} => min(A1,A2,A3,...)
For 32-bit integer and 32-bit floating point types.
template < typename _Tp, int n > V_TypeTraits<_Tp>::sum_type cv::v_reduce_sum(const v_reg<_Tp, n>& a)
Sum packed values.
Scheme:
{A1 A2 A3 ...} => sum(A1,A2,A3,...)
For 32-bit integer and 32-bit floating point types.
v_float32x4 cv::v_reduce_sum4( const v_float32x4& a, const v_float32x4& b, const v_float32x4& c, const v_float32x4& d )
Sums all elements of each input vector, returns the vector of sums.
Scheme:
result[0] = a[0] + a[1] + a[2] + a[3]
result[1] = b[0] + b[1] + b[2] + b[3]
result[2] = c[0] + c[1] + c[2] + c[3]
result[3] = d[0] + d[1] + d[2] + d[3]
template < typename _Tp, int n > v_reg<int, n> cv::v_round(const v_reg<_Tp, n>& a)
Round elements.
Only for floating point types.
template <int n> v_reg<int, n> cv::v_round(const v_reg<float, n>& a)
Round.
Rounds each value. Input type is float vector ==> output type is int vector.
template <int n> v_reg<int, n*2> cv::v_round(const v_reg<double, n>& a)
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
template < typename _Tp, int n > v_reg<_Tp, n> cv::v_select( const v_reg<_Tp, n>& mask, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Bitwise select.
The return value is built by combining the values a and b using the following scheme: if the i-th bit in mask is 1, select the i-th bit from a; otherwise, select the i-th bit from b.
template < typename _Tp, int n > int cv::v_signmask(const v_reg<_Tp, n>& a)
Get negative values mask.
The returned value is a bit mask with bits set to 1 at the positions corresponding to the indexes of negative packed values. Example:
v_int32x4 r; // set to {-1, -1, 1, 1}
int mask = v_signmask(r); // mask = 3 <== 00000000 00000000 00000000 00000011
For all types except 64-bit.
template < typename _Tp, int n > v_reg<_Tp, n> cv::v_sqr_magnitude( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Square of the magnitude.
Returns \(a^2 + b^2\). For floating point types only.
template < typename _Tp, int n > v_reg<_Tp, n> cv::v_sqrt(const v_reg<_Tp, n>& a)
Square root of elements.
Only for floating point types.
template < typename _Tp, int n > void cv::v_store( _Tp* ptr, const v_reg<_Tp, n>& a )
Store data to memory.
Store register contents to memory. Scheme:
REG {A B C D} ==> MEM {A B C D}
Pointer can be unaligned.
template < typename _Tp, int n > void cv::v_store_aligned( _Tp* ptr, const v_reg<_Tp, n>& a )
Store data to memory (aligned)
Store register contents to memory. Scheme:
REG {A B C D} ==> MEM {A B C D}
Pointer should be aligned to a 16-byte boundary.
template < typename _Tp, int n > void cv::v_store_high( _Tp* ptr, const v_reg<_Tp, n>& a )
Store data to memory (higher half)
Store higher half of register contents to memory. Scheme:
REG {A B C D} ==> MEM {C D}
template < typename _Tp, int n > void cv::v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Interleave and store (2 channels)
Interleave and store data from 2 registers to memory. Scheme:
{A1 A2 ...}, {B1 B2 ...} ==> {A1 B1 A2 B2 ...}
For all types except 64-bit.
template < typename _Tp, int n > void cv::v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c )
Interleave and store (3 channels)
Interleave and store data from 3 registers to memory. Scheme:
{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} ==> {A1 B1 C1 A2 B2 C2 ...}
For all types except 64-bit.
template < typename _Tp, int n > void cv::v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c, const v_reg<_Tp, n>& d )
Interleave and store (4 channels)
Interleave and store data from 4 registers to memory. Scheme:
{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...}
For all types except 64-bit.
template < typename _Tp, int n > void cv::v_store_low( _Tp* ptr, const v_reg<_Tp, n>& a )
Store data to memory (lower half)
Store lower half of register contents to memory. Scheme:
REG {A B C D} ==> MEM {A B}
template < typename _Tp, int n > v_reg<_Tp, n> cv::v_sub_wrap( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b )
Subtract values without saturation.
For 8- and 16-bit integer values.
template <typename _Tp> void cv::v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1, const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3, v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1, v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 )
Transpose 4x4 matrix.
Scheme:
a0 {A1 A2 A3 A4}
a1 {B1 B2 B3 B4}
a2 {C1 C2 C3 C4}
a3 {D1 D2 D3 D4}
===============
b0 {A1 B1 C1 D1}
b1 {A2 B2 C2 D2}
b2 {A3 B3 C3 D3}
b3 {A4 B4 C4 D4}
template < typename _Tp, int n > v_reg<int, n> cv::v_trunc(const v_reg<_Tp, n>& a)
Truncate elements.
Only for floating point types.
template <int n> v_reg<int, n> cv::v_trunc(const v_reg<float, n>& a)
Trunc.
Truncate each value. Input type is float vector ==> output type is int vector.
template <int n> v_reg<int, n*2> cv::v_trunc(const v_reg<double, n>& a)
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
template < typename _Tp, int n > void cv::v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1, v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
Interleave two vectors.
Scheme:
{A1 A2 A3 A4} {B1 B2 B3 B4} --------------- {A1 B1 A2 B2} and {A3 B3 A4 B4}
For all types except 64-bit.
Macros
#define CV_SIMD128
Set to 1 if current compiler supports vector extensions (NEON or SSE is enabled)
#define CV_SIMD128_64F
Set to 1 if current intrinsics implementation supports 64-bit float vectors.