Actual source code: iclsse.h
#ifndef __ICL_SSE_H_
#define __ICL_SSE_H_

#include <xmmintrin.h>

/* SSE_SCOPE_BEGIN must come after the LAST declaration in the enclosing (outermost) scope */
#define SSE_SCOPE_BEGIN { __m128 XMM0,XMM1,XMM2,XMM3,XMM4,XMM5,XMM6,XMM7; {
#define SSE_SCOPE_END }}

/* For use with SSE Inlined Assembly Blocks */
/* Note: SSE_ macro invocations must NOT be followed by a ; */

#define SSE_INLINE_BEGIN_1(arg1) { float *_tmp_arg1; _tmp_arg1=arg1;
#define SSE_INLINE_END_1 }
#define SSE_INLINE_BEGIN_2(arg1,arg2) { float *_tmp_arg1, *_tmp_arg2; _tmp_arg1=arg1; _tmp_arg2=arg2;
#define SSE_INLINE_END_2 }
#define SSE_INLINE_BEGIN_3(arg1,arg2,arg3) { float *_tmp_arg1, *_tmp_arg2, *_tmp_arg3; \
        _tmp_arg1=arg1; _tmp_arg2=arg2; _tmp_arg3=arg3;
#define SSE_INLINE_END_3 }

#define SSE_ARG_1 _tmp_arg1
#define SSE_ARG_2 _tmp_arg2
#define SSE_ARG_3 _tmp_arg3
/* Note: If more args are to be used, be sure the debug version uses the most args allowed */
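
/* Example usage (illustrative sketch, not part of the original header): scale
   four floats of y by the corresponding floats of x.  Here x and y are assumed
   to be 16-byte-aligned float* arguments of the enclosing routine (e.g.
   obtained with SSE_MALLOC below), and the FLOAT_n offsets defined next are
   element indices, so FLOAT_4 would address x[4], a 16-byte displacement.
   Note that the SSE_ macros carry their own semicolons.

   SSE_SCOPE_BEGIN
     SSE_INLINE_BEGIN_2(x,y)
       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
       SSE_LOAD_PS(SSE_ARG_2,FLOAT_0,XMM1)
       SSE_MULT_PS(XMM1,XMM0)
       SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM1)
     SSE_INLINE_END_2
   SSE_SCOPE_END
*/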
/* Offset values for SSE_ load/store/arithmetic memory ops */
#define FLOAT_0 0
#define FLOAT_1 1
#define FLOAT_2 2
#define FLOAT_3 3
#define FLOAT_4 4
#define FLOAT_5 5
#define FLOAT_6 6
#define FLOAT_7 7
#define FLOAT_8 8
#define FLOAT_9 9
#define FLOAT_10 10
#define FLOAT_11 11
#define FLOAT_12 12
#define FLOAT_13 13
#define FLOAT_14 14
#define FLOAT_15 15

#define FLOAT_16 16
#define FLOAT_24 24
#define FLOAT_32 32
#define FLOAT_40 40
#define FLOAT_48 48
#define FLOAT_56 56
#define FLOAT_64 64

#define DOUBLE_0 0
#define DOUBLE_1 1
#define DOUBLE_2 2
#define DOUBLE_3 3
#define DOUBLE_4 4
#define DOUBLE_5 5
#define DOUBLE_6 6
#define DOUBLE_7 7

#define DOUBLE_8 8
#define DOUBLE_16 16
#define DOUBLE_20 20
#define DOUBLE_24 24
#define DOUBLE_28 28
#define DOUBLE_32 32
/* xmmintrin.h provides for inline/debug versions automatically */
/* Inline versions */

/* Prefetch Macros */
#define SSE_PREFETCH_NTA(arg,offset) PREFETCH_NTA(&arg[offset]);
#define SSE_PREFETCH_L1(arg,offset) PREFETCH_L1(&arg[offset]);
#define SSE_PREFETCH_L2(arg,offset) PREFETCH_L2(&arg[offset]);
#define SSE_PREFETCH_L3(arg,offset) PREFETCH_L3(&arg[offset]);
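
/* Example (illustrative sketch): hint the next block of a float stream into
   cache ahead of use; FLOAT_16 means 16 floats (64 bytes) past SSE_ARG_1.
   The NTA hint avoids polluting the cache hierarchy with streaming data.

   SSE_PREFETCH_NTA(SSE_ARG_1,FLOAT_16)
*/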
/* Store Macros */
#define SSE_STORE_SS(arg,offset,srcreg) STORE_SS(&arg[offset],srcreg);
#define SSE_STOREL_PS(arg,offset,srcreg) STOREL_PS(&arg[offset],srcreg);
#define SSE_STOREH_PS(arg,offset,srcreg) STOREH_PS(&arg[offset],srcreg);
#define SSE_STORE_PS(arg,offset,srcreg) STORE_PS(&arg[offset],srcreg);
#define SSE_STOREU_PS(arg,offset,srcreg) STOREU_PS(&arg[offset],srcreg);
#define SSE_STREAM_PS(arg,offset,srcreg) STREAM_PS(&arg[offset],srcreg);

/* Register-Register Copy Macros */
#define SSE_COPY_SS(dstreg,srcreg) COPY_SS(dstreg,srcreg);
#define SSE_COPY_PS(dstreg,srcreg) COPY_PS(dstreg,srcreg);

/* Load Macros */
#define SSE_LOAD_SS(arg,offset,dstreg) LOAD_SS(&arg[offset],dstreg);
#define SSE_LOADL_PS(arg,offset,dstreg) LOADL_PS(&arg[offset],dstreg);
#define SSE_LOADH_PS(arg,offset,dstreg) LOADH_PS(&arg[offset],dstreg);
#define SSE_LOAD_PS(arg,offset,dstreg) LOAD_PS(&arg[offset],dstreg);
#define SSE_LOADU_PS(arg,offset,dstreg) LOADU_PS(&arg[offset],dstreg);

/* Shuffle */
#define SSE_SHUFFLE(dstreg,srcreg,imm) SHUFFLE(dstreg,srcreg,imm);
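
/* Example (illustrative sketch): SSE_SHUFFLE overwrites dstreg, taking its low
   two result lanes from dstreg and its high two from srcreg, as selected by
   the immediate.  _MM_SHUFFLE (from xmmintrin.h) builds the immediate; the
   call below broadcasts element 0 of XMM0 into all four lanes.

   SSE_SHUFFLE(XMM0,XMM0,_MM_SHUFFLE(0,0,0,0))
*/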
/* Multiply: A:=A*B */
#define SSE_MULT_SS(dstreg,srcreg) MULT_SS(dstreg,srcreg);
#define SSE_MULT_PS(dstreg,srcreg) MULT_PS(dstreg,srcreg);
#define SSE_MULT_SS_M(dstreg,arg,offset) MULT_SS_M(dstreg,&arg[offset]);
#define SSE_MULT_PS_M(dstreg,arg,offset) MULT_PS_M(dstreg,&arg[offset]);

/* Divide: A:=A/B */
#define SSE_DIV_SS(dstreg,srcreg) DIV_SS(dstreg,srcreg);
#define SSE_DIV_PS(dstreg,srcreg) DIV_PS(dstreg,srcreg);
#define SSE_DIV_SS_M(dstreg,arg,offset) DIV_SS_M(dstreg,&arg[offset]);
#define SSE_DIV_PS_M(dstreg,arg,offset) DIV_PS_M(dstreg,&arg[offset]);

/* Reciprocal: A:=1/B */
#define SSE_RECIP_SS(dstreg,srcreg) RECIP_SS(dstreg,srcreg);
#define SSE_RECIP_PS(dstreg,srcreg) RECIP_PS(dstreg,srcreg);
#define SSE_RECIP_SS_M(dstreg,arg,offset) RECIP_SS_M(dstreg,&arg[offset]);
#define SSE_RECIP_PS_M(dstreg,arg,offset) RECIP_PS_M(dstreg,&arg[offset]);

/* Add: A:=A+B */
#define SSE_ADD_SS(dstreg,srcreg) ADD_SS(dstreg,srcreg);
#define SSE_ADD_PS(dstreg,srcreg) ADD_PS(dstreg,srcreg);
#define SSE_ADD_SS_M(dstreg,arg,offset) ADD_SS_M(dstreg,&arg[offset]);
#define SSE_ADD_PS_M(dstreg,arg,offset) ADD_PS_M(dstreg,&arg[offset]);

/* Subtract: A:=A-B */
#define SSE_SUB_SS(dstreg,srcreg) SUB_SS(dstreg,srcreg);
#define SSE_SUB_PS(dstreg,srcreg) SUB_PS(dstreg,srcreg);
#define SSE_SUB_SS_M(dstreg,arg,offset) SUB_SS_M(dstreg,&arg[offset]);
#define SSE_SUB_PS_M(dstreg,arg,offset) SUB_PS_M(dstreg,&arg[offset]);

/* Logical: A:=A<op>B */
#define SSE_AND_SS(dstreg,srcreg) AND_SS(dstreg,srcreg);
#define SSE_ANDNOT_SS(dstreg,srcreg) ANDNOT_SS(dstreg,srcreg);
#define SSE_OR_SS(dstreg,srcreg) OR_SS(dstreg,srcreg);
#define SSE_XOR_SS(dstreg,srcreg) XOR_SS(dstreg,srcreg);

#define SSE_AND_PS(dstreg,srcreg) AND_PS(dstreg,srcreg);
#define SSE_ANDNOT_PS(dstreg,srcreg) ANDNOT_PS(dstreg,srcreg);
#define SSE_OR_PS(dstreg,srcreg) OR_PS(dstreg,srcreg);
#define SSE_XOR_PS(dstreg,srcreg) XOR_PS(dstreg,srcreg);

/* Comparisons A:=A<compare>B */
#define SSE_CMPEQ_SS(dstreg,srcreg) CMPEQ_SS(dstreg,srcreg);
#define SSE_CMPLT_SS(dstreg,srcreg) CMPLT_SS(dstreg,srcreg);
#define SSE_CMPLE_SS(dstreg,srcreg) CMPLE_SS(dstreg,srcreg);
#define SSE_CMPUNORD_SS(dstreg,srcreg) CMPUNORD_SS(dstreg,srcreg);
#define SSE_CMPNEQ_SS(dstreg,srcreg) CMPNEQ_SS(dstreg,srcreg);
#define SSE_CMPNLT_SS(dstreg,srcreg) CMPNLT_SS(dstreg,srcreg);
#define SSE_CMPNLE_SS(dstreg,srcreg) CMPNLE_SS(dstreg,srcreg);
#define SSE_CMPORD_SS(dstreg,srcreg) CMPORD_SS(dstreg,srcreg);

#define SSE_CMPEQ_PS(dstreg,srcreg) CMPEQ_PS(dstreg,srcreg);
#define SSE_CMPLT_PS(dstreg,srcreg) CMPLT_PS(dstreg,srcreg);
#define SSE_CMPLE_PS(dstreg,srcreg) CMPLE_PS(dstreg,srcreg);
#define SSE_CMPUNORD_PS(dstreg,srcreg) CMPUNORD_PS(dstreg,srcreg);
#define SSE_CMPNEQ_PS(dstreg,srcreg) CMPNEQ_PS(dstreg,srcreg);
#define SSE_CMPNLT_PS(dstreg,srcreg) CMPNLT_PS(dstreg,srcreg);
#define SSE_CMPNLE_PS(dstreg,srcreg) CMPNLE_PS(dstreg,srcreg);
#define SSE_CMPORD_PS(dstreg,srcreg) CMPORD_PS(dstreg,srcreg);
/* ================================================================================================ */

/* Other useful macros whose destinations are not SSE registers */

/* Movemask (for use after comparisons) */
/* Reduces the 128 bit mask to an integer built from the most significant bit of each 32 bit part. */
#define MOVEMASK(integ,srcxmmreg) integ = _mm_movemask_ps(srcxmmreg)

/* Double_4/Float_4 Conversions */
#define CONVERT_FLOAT4_DOUBLE4(dst,src) { double *_tmp_double_ptr; float *_tmp_float_ptr; \
        _tmp_double_ptr = dst; _tmp_float_ptr = src; \
        _tmp_double_ptr[0]=(double)_tmp_float_ptr[0]; \
        _tmp_double_ptr[1]=(double)_tmp_float_ptr[1]; \
        _tmp_double_ptr[2]=(double)_tmp_float_ptr[2]; \
        _tmp_double_ptr[3]=(double)_tmp_float_ptr[3]; }

#define CONVERT_DOUBLE4_FLOAT4(dst,src) { double *_tmp_double_ptr; float *_tmp_float_ptr; \
        _tmp_double_ptr = src; _tmp_float_ptr = dst; \
        _tmp_float_ptr[0]=(float)_tmp_double_ptr[0]; \
        _tmp_float_ptr[1]=(float)_tmp_double_ptr[1]; \
        _tmp_float_ptr[2]=(float)_tmp_double_ptr[2]; \
        _tmp_float_ptr[3]=(float)_tmp_double_ptr[3]; }
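
/* Example (illustrative sketch): widen four floats to doubles and back; the
   destination is the first argument of both macros.  f and d are assumed to
   be caller-provided arrays of at least four elements.

   float  f[4] = {1.0f,2.0f,3.0f,4.0f};
   double d[4];
   CONVERT_FLOAT4_DOUBLE4(d,f)
   CONVERT_DOUBLE4_FLOAT4(f,d)
*/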
/* Aligned Malloc */
#define SSE_MALLOC(var,sze) { void *_tmp_void_ptr = *var; size_t _tmp_size; _tmp_size = sze; \
        *var = _mm_malloc(sze,16); }
#define SSE_FREE(var) { void *_tmp_void_ptr = var; \
        _mm_free(var); }
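
/* Example (illustrative sketch): allocate a 16-byte-aligned block of 64
   floats, suitable for the aligned LOAD_PS/STORE_PS macros, and release it.
   Note that SSE_MALLOC takes the ADDRESS of the pointer being set.

   float *buf = 0;
   SSE_MALLOC(&buf,64*sizeof(float))
   ... use buf ...
   SSE_FREE(buf)
*/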
/* CPUID Instruction Macros */

#define CPUID_VENDOR 0
#define CPUID_FEATURES 1
#define CPUID_CACHE 2

#define CPUID(imm,_eax,_ebx,_ecx,_edx) { int _tmp_imm; \
        unsigned long _tmp_eax, _tmp_ebx, _tmp_ecx, _tmp_edx; \
        _tmp_eax=*_eax; _tmp_ebx=*_ebx; _tmp_ecx=*_ecx; _tmp_edx=*_edx; \
        _tmp_imm=imm; \
        __asm { \
          __asm mov eax, imm \
          __asm cpuid \
          __asm mov _tmp_eax, eax \
          __asm mov _tmp_ebx, ebx \
          __asm mov _tmp_ecx, ecx \
          __asm mov _tmp_edx, edx \
        } \
        *_eax=_tmp_eax; *_ebx=_tmp_ebx; *_ecx=_tmp_ecx; *_edx=_tmp_edx; \
      }

#define CPUID_GET_VENDOR(result) { char *_gv_vendor=result; int _gv_i; \
        unsigned long _gv_eax=0;unsigned long _gv_ebx=0;unsigned long _gv_ecx=0;unsigned long _gv_edx=0;\
        CPUID(CPUID_VENDOR,&_gv_eax,&_gv_ebx,&_gv_ecx,&_gv_edx); \
        for (_gv_i=0;_gv_i<4;_gv_i++) _gv_vendor[_gv_i+0]=*(((char *)(&_gv_ebx))+_gv_i); \
        for (_gv_i=0;_gv_i<4;_gv_i++) _gv_vendor[_gv_i+4]=*(((char *)(&_gv_edx))+_gv_i); \
        for (_gv_i=0;_gv_i<4;_gv_i++) _gv_vendor[_gv_i+8]=*(((char *)(&_gv_ecx))+_gv_i); \
      }
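
/* Example (illustrative sketch, requires a compiler that accepts the
   MSVC-style __asm blocks above): fetch the 12-character vendor
   identification string, e.g. "GenuineIntel".  The caller supplies the
   buffer and the terminating NUL.

   char vendor[13];
   CPUID_GET_VENDOR(vendor)
   vendor[12] = '\0';
*/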
/* ================================================================================================ */

/* The Stand Alone Versions of the SSE Macros */

/* Prefetch Macros */
#define PREFETCH_NTA(var) _mm_prefetch((char *)(var),_MM_HINT_NTA)
#define PREFETCH_L1(var) _mm_prefetch((char *)(var),_MM_HINT_T0)
#define PREFETCH_L2(var) _mm_prefetch((char *)(var),_MM_HINT_T1)
#define PREFETCH_L3(var) _mm_prefetch((char *)(var),_MM_HINT_T2)

/* Store Macros */
#define STORE_SS(var,srcreg) _mm_store_ss(var,srcreg)
#define STOREL_PS(var,srcreg) _mm_storel_pi((__m64 *)(var),srcreg)
#define STOREH_PS(var,srcreg) _mm_storeh_pi((__m64 *)(var),srcreg)
#define STORE_PS(var,srcreg) _mm_store_ps(var,srcreg)
#define STOREU_PS(var,srcreg) _mm_storeu_ps(var,srcreg)
#define STREAM_PS(var,srcreg) _mm_stream_ps(var,srcreg)

/* Register-Register Copy Macros */
#define COPY_SS(dstreg,srcreg) dstreg = _mm_move_ss(dstreg,srcreg)
#define COPY_PS(dstreg,srcreg) dstreg = srcreg

/* Load Macros */
#define LOAD_SS(var,dstreg) dstreg = _mm_load_ss(var)
#define LOADL_PS(var,dstreg) dstreg = _mm_loadl_pi(dstreg,(__m64 *)(var))
#define LOADH_PS(var,dstreg) dstreg = _mm_loadh_pi(dstreg,(__m64 *)(var))
#define LOAD_PS(var,dstreg) dstreg = _mm_load_ps(var)
#define LOADU_PS(var,dstreg) dstreg = _mm_loadu_ps(var)

/* Shuffle */
#define SHUFFLE(dstreg,srcreg,i) dstreg = _mm_shuffle_ps(dstreg,srcreg,i)

/* Multiply: A:=A*B */
#define MULT_SS(dstreg,srcreg) dstreg = _mm_mul_ss(dstreg,srcreg)
#define MULT_PS(dstreg,srcreg) dstreg = _mm_mul_ps(dstreg,srcreg)
#define MULT_SS_M(dstreg,var) dstreg = _mm_mul_ss(dstreg,_mm_load_ss(var))
#define MULT_PS_M(dstreg,var) dstreg = _mm_mul_ps(dstreg,_mm_load_ps(var))

/* Divide: A:=A/B */
#define DIV_SS(dstreg,srcreg) dstreg = _mm_div_ss(dstreg,srcreg)
#define DIV_PS(dstreg,srcreg) dstreg = _mm_div_ps(dstreg,srcreg)
#define DIV_SS_M(dstreg,var) dstreg = _mm_div_ss(dstreg,_mm_load_ss(var))
#define DIV_PS_M(dstreg,var) dstreg = _mm_div_ps(dstreg,_mm_load_ps(var))

/* Reciprocal: A:=1/B */
#define RECIP_SS(dstreg,srcreg) dstreg = _mm_rcp_ss(srcreg)
#define RECIP_PS(dstreg,srcreg) dstreg = _mm_rcp_ps(srcreg)
#define RECIP_SS_M(dstreg,var) dstreg = _mm_rcp_ss(_mm_load_ss(var))
#define RECIP_PS_M(dstreg,var) dstreg = _mm_rcp_ps(_mm_load_ps(var))
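
/* Note (not in the original header): _mm_rcp_ss/_mm_rcp_ps return only an
   approximation to the reciprocal (roughly 12 bits of precision).  Where full
   single precision matters, use the DIV_ macros instead, or refine the
   estimate with one Newton-Raphson step, x1 = x0*(2 - d*x0).  Illustrative
   sketch, assuming XMM0 holds d and a float array two[4] = {2,2,2,2} is in
   scope (the stand-alone macros below do need trailing semicolons):

   RECIP_PS(XMM1,XMM0);         x0, the initial estimate of 1/d
   COPY_PS(XMM2,XMM0);
   MULT_PS(XMM2,XMM1);          d*x0
   LOADU_PS(two,XMM3);
   SUB_PS(XMM3,XMM2);           2 - d*x0
   MULT_PS(XMM1,XMM3);          x1 = x0*(2 - d*x0)
*/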
/* Add: A:=A+B */
#define ADD_SS(dstreg,srcreg) dstreg = _mm_add_ss(dstreg,srcreg)
#define ADD_PS(dstreg,srcreg) dstreg = _mm_add_ps(dstreg,srcreg)
#define ADD_SS_M(dstreg,var) dstreg = _mm_add_ss(dstreg,_mm_load_ss(var))
#define ADD_PS_M(dstreg,var) dstreg = _mm_add_ps(dstreg,_mm_load_ps(var))

/* Subtract: A:=A-B */
#define SUB_SS(dstreg,srcreg) dstreg = _mm_sub_ss(dstreg,srcreg)
#define SUB_PS(dstreg,srcreg) dstreg = _mm_sub_ps(dstreg,srcreg)
#define SUB_SS_M(dstreg,var) dstreg = _mm_sub_ss(dstreg,_mm_load_ss(var))
#define SUB_PS_M(dstreg,var) dstreg = _mm_sub_ps(dstreg,_mm_load_ps(var))

/* Logical: A:=A<op>B */
#define AND_SS(dstreg,srcreg) dstreg = _mm_and_ss(dstreg,srcreg)
#define ANDNOT_SS(dstreg,srcreg) dstreg = _mm_andnot_ss(dstreg,srcreg)
#define OR_SS(dstreg,srcreg) dstreg = _mm_or_ss(dstreg,srcreg)
#define XOR_SS(dstreg,srcreg) dstreg = _mm_xor_ss(dstreg,srcreg)

#define AND_PS(dstreg,srcreg) dstreg = _mm_and_ps(dstreg,srcreg)
#define ANDNOT_PS(dstreg,srcreg) dstreg = _mm_andnot_ps(dstreg,srcreg)
#define OR_PS(dstreg,srcreg) dstreg = _mm_or_ps(dstreg,srcreg)
#define XOR_PS(dstreg,srcreg) dstreg = _mm_xor_ps(dstreg,srcreg)
/* Implementing an if():
   First perform the comparison, then use MOVEMASK to reduce the result to an
   integer, say i, and branch with if (i) ...
*/
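
/* Example (illustrative sketch): branch if any of the four floats in XMM0 is
   less than the corresponding float in XMM1.  CMPLT_PS overwrites each lane of
   XMM0 with an all-ones or all-zeros mask; MOVEMASK then collapses the four
   sign bits into an int.

   int flag;
   CMPLT_PS(XMM0,XMM1);
   MOVEMASK(flag,XMM0);
   if (flag) {
      ... at least one lane satisfied the comparison
   }
*/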
/*
   Note: From the IA Software Developer's Manual:
   The greater-than relations not implemented in hardware require more than one instruction to
   emulate in software and therefore should not be implemented as pseudo-ops.  (For these, the
   programmer should reverse the operands of the corresponding less-than relations and use move
   instructions to ensure that the mask is moved to the correct destination register and that the
   source operand is left intact.)
*/

/* Comparisons A:=A<compare>B */
#define CMPEQ_SS(dstreg,srcreg) dstreg = _mm_cmpeq_ss(dstreg,srcreg)
#define CMPLT_SS(dstreg,srcreg) dstreg = _mm_cmplt_ss(dstreg,srcreg)
#define CMPLE_SS(dstreg,srcreg) dstreg = _mm_cmple_ss(dstreg,srcreg)
#define CMPUNORD_SS(dstreg,srcreg) dstreg = _mm_cmpunord_ss(dstreg,srcreg)
#define CMPNEQ_SS(dstreg,srcreg) dstreg = _mm_cmpneq_ss(dstreg,srcreg)
#define CMPNLT_SS(dstreg,srcreg) dstreg = _mm_cmpnlt_ss(dstreg,srcreg)
#define CMPNLE_SS(dstreg,srcreg) dstreg = _mm_cmpnle_ss(dstreg,srcreg)
#define CMPORD_SS(dstreg,srcreg) dstreg = _mm_cmpord_ss(dstreg,srcreg)

#define CMPEQ_PS(dstreg,srcreg) dstreg = _mm_cmpeq_ps(dstreg,srcreg)
#define CMPLT_PS(dstreg,srcreg) dstreg = _mm_cmplt_ps(dstreg,srcreg)
#define CMPLE_PS(dstreg,srcreg) dstreg = _mm_cmple_ps(dstreg,srcreg)
#define CMPUNORD_PS(dstreg,srcreg) dstreg = _mm_cmpunord_ps(dstreg,srcreg)
#define CMPNEQ_PS(dstreg,srcreg) dstreg = _mm_cmpneq_ps(dstreg,srcreg)
#define CMPNLT_PS(dstreg,srcreg) dstreg = _mm_cmpnlt_ps(dstreg,srcreg)
#define CMPNLE_PS(dstreg,srcreg) dstreg = _mm_cmpnle_ps(dstreg,srcreg)
#define CMPORD_PS(dstreg,srcreg) dstreg = _mm_cmpord_ps(dstreg,srcreg)

/* ================================================================================================ */

#endif /* __ICL_SSE_H_ */