30 #ifndef GDALSSE_PRIV_H_INCLUDED
31 #define GDALSSE_PRIV_H_INCLUDED
37 #if (defined(__x86_64) || defined(_M_X64)) && !defined(USE_SSE2_EMULATION)
40 #include <emmintrin.h>
61 static inline XMMReg2Double Load1ValHighAndLow(const double* ptr)
64 reg.nsLoad1ValHighAndLow(ptr);
82 static inline XMMReg2Double Load2ValAligned(const double* ptr)
85 reg.nsLoad2ValAligned(ptr);
89 static inline XMMReg2Double Load2Val(const unsigned char* ptr)
103 static inline XMMReg2Double Load2Val(const unsigned short* ptr)
113 reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
120 reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
127 reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
134 reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
141 reg.xmm = _mm_or_pd(_mm_and_pd (cond.xmm, true_expr.xmm), _mm_andnot_pd(cond.xmm, false_expr.xmm));
148 reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
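The comparison helpers above (Equals, NotEquals, Greater) each produce a per-lane bit mask of all ones or all zeros, and Ternary combines such a mask with and/andnot/or to pick lanes from true_expr or false_expr without branching. A minimal standalone sketch of that select idiom using plain SSE2 intrinsics, not taken from this header (select_pd is a made-up name for illustration):

    #include <emmintrin.h>

    // Keep the lanes of a where mask is all ones, the lanes of b where it
    // is all zeros -- the same combination used by XMMReg2Double::Ternary().
    static inline __m128d select_pd(__m128d mask, __m128d a, __m128d b)
    {
        return _mm_or_pd(_mm_and_pd(mask, a), _mm_andnot_pd(mask, b));
    }

    // Example: clamp negative lanes of v to zero.
    // __m128d mask = _mm_cmpgt_pd(v, _mm_setzero_pd());
    // __m128d r    = select_pd(mask, v, _mm_setzero_pd());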
152 inline void nsLoad1ValHighAndLow(const double* ptr)
154 xmm = _mm_load1_pd(ptr);
157 inline void nsLoad2Val(const double* ptr)
159 xmm = _mm_loadu_pd(ptr);
162 inline void nsLoad2ValAligned(const double* pval)
164 xmm = _mm_load_pd(pval);
167 inline void nsLoad2Val(const float* pval)
169 __m128 temp1 = _mm_load_ss(pval);
170 __m128 temp2 = _mm_load_ss(pval + 1);
171 temp1 = _mm_shuffle_ps(temp1, temp2, _MM_SHUFFLE(1,0,1,0));
172 temp1 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,3,2,0));
173 xmm = _mm_cvtps_pd(temp1);
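// The two _mm_shuffle_ps calls pack pval[0] and pval[1] into the low two
// float lanes of temp1; _mm_cvtps_pd then widens those two floats into the
// two double lanes of xmm.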
176 inline void nsLoad2Val(const unsigned char* ptr)
178 #ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
181 __m128i xmm_i = _mm_cvtsi32_si128(s);
183 __m128i xmm_i = _mm_cvtsi32_si128(*(unsigned short*)(ptr));
185 xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
186 xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
187 xmm = _mm_cvtepi32_pd(xmm_i);
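// Widening chain for two packed bytes: _mm_unpacklo_epi8 zero-extends them
// to 16 bits, _mm_unpacklo_epi16 to 32 bits, and _mm_cvtepi32_pd converts
// the two low int32 lanes to doubles.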
190 inline void nsLoad2Val(const short* ptr)
194 __m128i xmm_i = _mm_cvtsi32_si128(i);
195 xmm_i = _mm_unpacklo_epi16(xmm_i,xmm_i);
196 xmm_i = _mm_srai_epi32(xmm_i, 16);
197 xmm = _mm_cvtepi32_pd(xmm_i);
200 inline void nsLoad2Val(const unsigned short* ptr)
204 __m128i xmm_i = _mm_cvtsi32_si128(i);
205 xmm_i = _mm_unpacklo_epi16(xmm_i,xmm_i);
206 xmm_i = _mm_srli_epi32(xmm_i, 16);
207 xmm = _mm_cvtepi32_pd(xmm_i);
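// Same duplicate-and-shift pattern as the signed loader above, but the
// logical shift _mm_srli_epi32 zero-extends the 16-bit values where
// _mm_srai_epi32 sign-extends them.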
212 #ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
215 __m128i xmm_i = _mm_cvtsi32_si128(i);
217 __m128i xmm_i = _mm_cvtsi32_si128(*(int*)(ptr));
219 xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
220 xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
221 low.xmm = _mm_cvtepi32_pd(xmm_i);
222 high.xmm = _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i,_MM_SHUFFLE(3,2,3,2)));
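// _mm_shuffle_epi32 with _MM_SHUFFLE(3,2,3,2) moves int32 lanes 2 and 3
// into the low positions so the second pair of values can be converted to
// doubles for the high register.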
228 high.nsLoad2Val(ptr+2);
234 high.nsLoad2Val(ptr+2);
240 high.nsLoad2Val(ptr+2);
245 __m128 temp1 = _mm_loadu_ps(ptr);
246 __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,2,3,2));
247 low.xmm = _mm_cvtps_pd(temp1);
248 high.xmm = _mm_cvtps_pd(temp2);
251 inline void Zeroize()
253 xmm = _mm_setzero_pd();
264 xmm = _mm_add_pd(xmm, other.xmm);
270 xmm = _mm_mul_pd(xmm, other.xmm);
277 ret.xmm = _mm_add_pd(xmm, other.xmm);
284 ret.xmm = _mm_sub_pd(xmm, other.xmm);
291 ret.xmm = _mm_mul_pd(xmm, other.xmm);
298 ret.xmm = _mm_div_pd(xmm, other.xmm);
302 inline void AddLowAndHigh()
305 xmm2 = _mm_shuffle_pd(xmm,xmm,_MM_SHUFFLE2(0,1));
306 xmm = _mm_add_pd(xmm, xmm2);
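// _MM_SHUFFLE2(0,1) swaps the two double lanes, so after the add both
// lanes contain low + high; operator double below reads that horizontal
// sum from the low lane.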
309 inline void Store2Double(double* pval) const
311 _mm_storeu_pd(pval, xmm);
314 inline void Store2DoubleAligned(double* pval) const
316 _mm_store_pd(pval, xmm);
319 void Store2Val(unsigned short* ptr) const
321 __m128i tmp = _mm_cvtpd_epi32(xmm);
322 ptr[0] = (GUInt16)_mm_extract_epi16(tmp, 0);
323 ptr[1] = (GUInt16)_mm_extract_epi16(tmp, 2);
326 inline operator double () const
329 _mm_store_sd(&val, xmm);
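Taken together, the class is intended as a thin value wrapper around two packed doubles. A minimal usage sketch, assuming the header is included and the SSE2 path is compiled in (dot2 is an illustrative helper, not part of GDAL):

    // Dot product of two length-2 double arrays using the wrappers above.
    static double dot2(const double* a, const double* b)
    {
        XMMReg2Double va, vb;
        va.nsLoad2Val(a);                  // {a[0], a[1]}
        vb.nsLoad2Val(b);                  // {b[0], b[1]}
        XMMReg2Double prod = va * vb;      // lane-wise multiply
        prod.AddLowAndHigh();              // both lanes = a[0]*b[0] + a[1]*b[1]
        return static_cast<double>(prod);  // read the low lane
    }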
336 #warning "Software emulation of SSE2 !"
355 static inline XMMReg2Double Load1ValHighAndLow(const double* ptr)
358 reg.nsLoad1ValHighAndLow(ptr);
366 if (expr1.low == expr2.low)
367 memset(&(reg.low), 0xFF, sizeof(double));
371 if (expr1.high == expr2.high)
372 memset(&(reg.high), 0xFF, sizeof(double));
383 if (expr1.low != expr2.low)
384 memset(&(reg.low), 0xFF, sizeof(double));
388 if (expr1.high != expr2.high)
389 memset(&(reg.high), 0xFF, sizeof(double));
400 if (expr1.low > expr2.low)
401 memset(&(reg.low), 0xFF, sizeof(double));
405 if (expr1.high > expr2.high)
406 memset(&(reg.high), 0xFF, sizeof(double));
416 int low1[2], high1[2];
417 int low2[2], high2[2];
418 memcpy(low1, &expr1.low, sizeof(double));
419 memcpy(high1, &expr1.high, sizeof(double));
420 memcpy(low2, &expr2.low, sizeof(double));
421 memcpy(high2, &expr2.high, sizeof(double));
424 high1[0] &= high2[0];
425 high1[1] &= high2[1];
426 memcpy(&reg.low, low1, sizeof(double));
427 memcpy(&reg.high, high1, sizeof(double));
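// The masks produced by the emulated Equals()/Greater() are combined here
// with a plain integer AND over the byte representation of each double,
// mirroring what _mm_and_pd does in the SSE2 path.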
435 reg.low = true_expr.low;
437 reg.low = false_expr.low;
439 reg.high = true_expr.high;
441 reg.high = false_expr.high;
448 reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
449 reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
460 static inline XMMReg2Double Load2ValAligned(const double* ptr)
463 reg.nsLoad2ValAligned(ptr);
474 static inline XMMReg2Double Load2Val(const unsigned char* ptr)
488 static inline XMMReg2Double Load2Val(const unsigned short* ptr)
495 inline void nsLoad1ValHighAndLow(const double* pval)
501 inline void nsLoad2Val(const double* pval)
507 inline void nsLoad2ValAligned(const double* pval)
513 inline void nsLoad2Val(const float* pval)
519 inline void nsLoad2Val(const unsigned char* ptr)
525 inline void nsLoad2Val(const short* ptr)
531 inline void nsLoad2Val(const unsigned short* ptr)
548 high.nsLoad2Val(ptr+2);
554 high.nsLoad2Val(ptr+2);
560 high.nsLoad2Val(ptr+2);
566 high.nsLoad2Val(ptr+2);
569 inline void Zeroize()
599 ret.low = low + other.low;
600 ret.high = high + other.high;
607 ret.low = low - other.low;
608 ret.high = high - other.high;
615 ret.low = low * other.low;
616 ret.high = high * other.high;
623 ret.low = low / other.low;
624 ret.high = high / other.high;
628 inline void AddLowAndHigh()
630 double add = low + high;
635 inline void Store2Double(double* pval) const
641 inline void Store2DoubleAligned(double* pval) const
647 void Store2Val(unsigned short* ptr) const
649 ptr[0] = (GUInt16)low;
650 ptr[1] = (GUInt16)high;
653 inline operator double () const
677 static inline XMMReg4Double Load1ValHighAndLow(const double* ptr)
680 reg.low.nsLoad1ValHighAndLow(ptr);
685 static inline XMMReg4Double Load4Val(const unsigned char* ptr)
688 XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
695 reg.low.nsLoad2Val(ptr);
696 reg.high.nsLoad2Val(ptr+2);
700 static inline XMMReg4Double Load4Val(const unsigned short* ptr)
703 reg.low.nsLoad2Val(ptr);
704 reg.high.nsLoad2Val(ptr+2);
711 reg.low.nsLoad2Val(ptr);
712 reg.high.nsLoad2Val(ptr+2);
716 static inline XMMReg4Double Load4ValAligned(const double* ptr)
719 reg.low.nsLoad2ValAligned(ptr);
720 reg.high.nsLoad2ValAligned(ptr+2);
727 XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
734 reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
735 reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
742 reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
743 reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
750 reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
751 reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
758 reg.low = XMMReg2Double::And(expr1.low, expr2.low);
759 reg.high = XMMReg2Double::And(expr1.high, expr2.high);
766 reg.low = XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
767 reg.high = XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
774 reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
775 reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
803 ret.low = low + other.low;
804 ret.high = high + other.high;
811 ret.low = low - other.low;
812 ret.high = high - other.high;
819 ret.low = low * other.low;
820 ret.high = high * other.high;
827 ret.low = low / other.low;
828 ret.high = high / other.high;
832 inline void AddLowAndHigh()
848 void Store4Val(unsigned short* ptr) const
851 high.Store2Val(ptr+2);
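As with the two-lane class, XMMReg4Double is meant to be consumed four values at a time, delegating each operation to its low and high XMMReg2Double halves. A small sketch, assuming either the SSE2 or the emulated path is compiled in (scale4 is an illustrative helper, not part of GDAL):

    // Multiply four 16-bit samples by a weight and write them back.
    static void scale4(const unsigned short* in, double weight, unsigned short* out)
    {
        XMMReg4Double v = XMMReg4Double::Load4Val(in);                 // widen 4 x uint16 to 4 x double
        XMMReg4Double w = XMMReg4Double::Load1ValHighAndLow(&weight);  // broadcast weight to all 4 lanes
        XMMReg4Double r = v * w;                                       // lane-wise multiply
        r.Store4Val(out);                                              // convert back to uint16
    }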