summaryrefslogtreecommitdiff
path: root/core/multimedia/opieplayer/libmpeg3/video/mmxidct.S
Unidiff
Diffstat (limited to 'core/multimedia/opieplayer/libmpeg3/video/mmxidct.S') (more/less context) (show whitespace changes)
-rw-r--r--core/multimedia/opieplayer/libmpeg3/video/mmxidct.S675
1 files changed, 675 insertions, 0 deletions
diff --git a/core/multimedia/opieplayer/libmpeg3/video/mmxidct.S b/core/multimedia/opieplayer/libmpeg3/video/mmxidct.S
new file mode 100644
index 0000000..9c3bebe
--- a/dev/null
+++ b/core/multimedia/opieplayer/libmpeg3/video/mmxidct.S
@@ -0,0 +1,675 @@
1/*
2 * the input data is transposed and each 16 bit element in the 8x8 matrix
3 * is left aligned:
4 * for example in 11...1110000 format
5 * If the iDCT is of an I macroblock then 0.5 needs to be added to the DC component
6 * (element[0][0] of the matrix)
7 */
8
9/* extrn re_matrix */
10
11/* constants */
12
13.data
14 .align 16
/*
 * preSC: 8x8 table of 16-bit prescale factors (8 rows of 8 .short = 128 bytes).
 * IDCT_mmx loads each quadword 8*k(preSC), k = 0..15, and multiplies it with
 * pmulhw against the quadword at the same offset of the input matrix before
 * the first butterfly pass.
 */
15 .type preSC, @object
16preSC: .short 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520
17 .short 22725, 31521, 29692, 26722, 22725, 17855, 12299, 6270
18 .short 21407, 29692, 27969, 25172, 21407, 16819, 11585, 5906
19 .short 19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315
20 .short 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520
21 .short 12873, 17855, 16819, 15137, 25746, 20228, 13933, 7103
22 .short 17734, 24598, 23170, 20853, 17734, 13933, 9597, 4892
23 .short 18081, 25080, 23624, 21261, 18081, 14206, 9785, 4988
24 .size preSC, 128
25 .align 8
/*
 * Four 16-bit lanes, low to high: 1, 1, 2, 5.
 * Added with paddw to V15 in IDCT_mmx as the per-lane "magic" rebias
 * described in the comment at that instruction.
 */
26 .type x0005000200010001, @object
27 .size x0005000200010001, 8
28x0005000200010001:
29 .long 0x00010001, 0x00050002
30 .align 8
/*
 * Four 16-bit lanes, low to high: 0, 0, 0, 0x0040.
 * Not referenced by the IDCT_mmx code in this file.
 * NOTE(review): purpose not visible here - confirm whether another
 * translation unit uses it before removing.
 */
31 .type x0040000000000000, @object
32 .size x0040000000000000, 8
33x0040000000000000:
34 .long 0, 0x00400000
35 .align 8
/*
 * 0x5a82 = 23170 replicated in all four 16-bit lanes.
 * pmulhw multiplier used by IDCT_mmx for the V18/V34/V52/V68 terms and
 * their second-pass counterparts (V102, V118, V136, V152).
 */
36 .type x5a825a825a825a82, @object
37 .size x5a825a825a825a82, 8
38x5a825a825a825a82:
39 .long 0x5a825a82, 0x5a825a82
40 .align 8
/*
 * 0x539f = 21407 replicated in all four 16-bit lanes.
 * pmulhw multiplier used by IDCT_mmx in the odd-part computations
 * (V33, V67, V117, V151).
 */
41 .type x539f539f539f539f, @object
42 .size x539f539f539f539f, 8
43x539f539f539f539f:
44 .long 0x539f539f, 0x539f539f
45 .align 8
/*
 * 0x4546 = 17734 replicated in all four 16-bit lanes.
 * pmulhw multiplier used by IDCT_mmx in the odd-part computations
 * (V35, V69, V119, V153).
 */
46 .type x4546454645464546, @object
47 .size x4546454645464546, 8
48x4546454645464546:
49 .long 0x45464546, 0x45464546
50 .align 8
/*
 * 0x61f8 = 25080 replicated in all four 16-bit lanes.
 * pmulhw multiplier used by IDCT_mmx in the odd-part computations
 * (V36, V70, V120, V154).
 */
51 .type x61f861f861f861f8, @object
52 .size x61f861f861f861f8, 8
53x61f861f861f861f8:
54 .long 0x61f861f8, 0x61f861f8
55/* Static variables */
56 .align 8
/*
 * x0: one zero-initialised, 8-byte-aligned quadword in .data.
 * The IDCT_mmx code in this file does not reference it.
 */
57 .type x0, @object
58 .size x0, 8
59x0:
60 .quad 0
61/* Procedure */
62
63
64 .align 8
65.text
66 .align 4
/*
 * IDCT_mmx - in-place 8x8 inverse DCT on 16-bit elements using MMX.
 *
 * Argument: one pointer, read from 8(%ebp) after the frame is set up; it
 *   addresses the matrix as 16 quadwords at offsets 8*0 .. 8*15 (128 bytes).
 *   All 16 quadwords are overwritten with the results (out0 .. out15).
 *   NOTE(review): presumably called from C as void IDCT_mmx(short *block)
 *   with the i386 cdecl convention - confirm against the caller.
 *
 * Structure (straight-line code, no branches):
 *   1. first pass: every input quadword is multiplied (pmulhw) by the
 *      matching quadword of preSC, then the even/odd butterflies produce
 *      the tm0..tm15 values;
 *   2. the four 4x4 sub-blocks (M1..M4) are transposed with punpck*;
 *   3. second pass ("after transpose"): the butterflies are repeated without
 *      the prescale multiply and every result is shifted right by 4
 *      (psraw $4) before being stored as outN.
 *
 * Registers: %esi = matrix, %ecx = address of preSC, %mm0-%mm7 all used.
 *   %ebx, %ecx, %edx, %esi, %edi and %ebp are saved and restored.
 * Stack: 32 bytes of scratch at (%esp)..24(%esp) hold out7, out5, out3 and
 *   out1 (in that order) until the final stores.
 * No emms is executed in this routine.
 *   NOTE(review): the caller presumably has to issue emms before any x87
 *   floating-point code - confirm.
 */
67.globl IDCT_mmx
68 .type IDCT_mmx, @function
69IDCT_mmx:
70 pushl %ebp
71 movl %esp, %ebp
72 pushl %ebx
73 pushl %ecx
74 pushl %edx
75 pushl %esi
76 pushl %edi
77
/* 8 dwords = 32 bytes = four quadword scratch slots at (%esp)..24(%esp).
 * Each slot is stored to before it is loaded, so the zeros are never read.
 */
78 pushl $0 /* allocate the temp variables */
79 pushl $0
80 pushl $0
81 pushl $0
82 pushl $0
83 pushl $0
84 pushl $0
85 pushl $0
86
87 movl 8(%ebp), %esi /* source matrix */
88 leal preSC, %ecx /* absolute address of the prescale table */
89/* column 0: even part
90 * use V4, V12, V0, V8 to produce V22..V25
91 */
92 movq 8*12(%ecx), %mm0 /* maybe the first mul can be done together */
93 /* with the dequantization in iHuff module */
94 pmulhw 8*12(%esi), %mm0 /* V12 */
95 movq 8*4(%ecx), %mm1
96 pmulhw 8*4(%esi), %mm1 /* V4 */
97 movq (%ecx), %mm3
98 psraw $1, %mm0 /* t64=t66 */
99 pmulhw (%esi), %mm3 /* V0 */
100 movq 8*8(%ecx), %mm5 /* prescale factors for V8 */
101 movq %mm1, %mm2 /* duplicate V4 (added 11/1/96) */
102 pmulhw 8*8(%esi),%mm5 /* V8 */
103 psubsw %mm0, %mm1 /* V16 */
104 pmulhw x5a825a825a825a82, %mm1/* 23170 ->V18 */
105 paddsw %mm0, %mm2 /* V17 */
106 movq %mm2, %mm0 /* duplicate V17 */
107 psraw $1, %mm2 /* t75=t82 */
108 psraw $2, %mm0 /* t72 */
109 movq %mm3, %mm4 /* duplicate V0 */
110 paddsw %mm5, %mm3 /* V19 */
111 psubsw %mm5, %mm4 /* V20 ;mm5 free */
112/* moved from the block below */
113 movq 8*10(%ecx), %mm7
114 psraw $1, %mm3 /* t74=t81 */
115 movq %mm3, %mm6 /* duplicate t74=t81 */
116 psraw $2, %mm4 /* t77=t79 */
117 psubsw %mm0, %mm1 /* V21 ; mm0 free */
118 paddsw %mm2, %mm3 /* V22 */
119 movq %mm1, %mm5 /* duplicate V21 */
120 paddsw %mm4, %mm1 /* V23 */
121 movq %mm3, 8*4(%esi) /* V22 */
122 psubsw %mm5, %mm4 /* V24; mm5 free */
123 movq %mm1, 8*12(%esi) /* V23 */
124 psubsw %mm2, %mm6 /* V25; mm2 free */
125 movq %mm4, (%esi) /* V24 */
126/* keep mm6 alive all along the next block */
127 /* movq %mm6, 8*8(%esi) V25 */
128/* column 0: odd part
129 * use V2, V6, V10, V14 to produce V31, V39, V40, V41
130 */
131/* moved above: movq 8*10(%ecx), %mm7 */
132
133 pmulhw 8*10(%esi), %mm7 /* V10 */
134 movq 8*6(%ecx), %mm0
135 pmulhw 8*6(%esi), %mm0 /* V6 */
136 movq 8*2(%ecx), %mm5
137 movq %mm7, %mm3 /* duplicate V10 */
138 pmulhw 8*2(%esi), %mm5 /* V2 */
139 movq 8*14(%ecx), %mm4
140 psubsw %mm0, %mm7 /* V26 */
141 pmulhw 8*14(%esi), %mm4 /* V14 */
142 paddsw %mm0, %mm3 /* V29 ; free mm0 */
143 movq %mm7, %mm1 /* duplicate V26 */
144 psraw $1, %mm3 /* t91=t94 */
145 pmulhw x539f539f539f539f,%mm7/* V33 */
146 psraw $1, %mm1 /* t96 */
147 movq %mm5, %mm0 /* duplicate V2 */
148 psraw $2, %mm4 /* t85=t87 */
149 paddsw %mm4,%mm5 /* V27 */
150 psubsw %mm4, %mm0 /* V28 ; free mm4 */
151 movq %mm0, %mm2 /* duplicate V28 */
152 psraw $1, %mm5 /* t90=t93 */
153 pmulhw x4546454645464546,%mm0/* V35 */
154 psraw $1, %mm2 /* t97 */
155 movq %mm5, %mm4 /* duplicate t90=t93 */
156 psubsw %mm2, %mm1 /* V32 ; free mm2 */
157 pmulhw x61f861f861f861f8,%mm1/* V36 */
158 psllw $1, %mm7 /* t107 */
159 paddsw %mm3, %mm5 /* V31 */
160 psubsw %mm3, %mm4 /* V30 ; free mm3 */
161 pmulhw x5a825a825a825a82,%mm4/* V34 */
162 nop
163 psubsw %mm1, %mm0 /* V38 */
164 psubsw %mm7, %mm1 /* V37 ; free mm7 */
165 psllw $1, %mm1 /* t114 */
166/* move from the next block */
167 movq %mm6, %mm3 /* duplicate V25 */
168/* move from the next block */
169 movq 8*4(%esi), %mm7 /* V22 */
170 psllw $1, %mm0 /* t110 */
171 psubsw %mm5, %mm0 /* V39 (mm5 needed for next block) */
172 psllw $2, %mm4 /* t112 */
173/* moved from the next block */
174 movq 8*12(%esi), %mm2 /* V23 */
175 psubsw %mm0, %mm4 /* V40 */
176 paddsw %mm4, %mm1 /* V41; free mm0 */
177/* moved from the next block */
178 psllw $1, %mm2 /* t117=t125 */
179/* column 0: output butterfly */
180/* moved above:
181 * movq %mm6, %mm3 duplicate V25
182 * movq 8*4(%esi), %mm7 V22
183 * movq 8*12(%esi), %mm2 V23
184 * psllw $1, %mm2 t117=t125
185 */
186 psubsw %mm1, %mm6 /* tm6 */
187 paddsw %mm1, %mm3 /* tm8; free mm1 */
188 movq %mm7, %mm1 /* duplicate V22 */
189 paddsw %mm5, %mm7 /* tm0 */
190 movq %mm3, 8*8(%esi) /* tm8; free mm3 */
191 psubsw %mm5, %mm1 /* tm14; free mm5 */
192 movq %mm6, 8*6(%esi) /* tm6; free mm6 */
193 movq %mm2, %mm3 /* duplicate t117=t125 */
194 movq (%esi), %mm6 /* V24 */
195 paddsw %mm0, %mm2 /* tm2 */
196 movq %mm7, (%esi) /* tm0; free mm7 */
197 psubsw %mm0, %mm3 /* tm12; free mm0 */
198 movq %mm1, 8*14(%esi) /* tm14; free mm1 */
199 psllw $1, %mm6 /* t119=t123 */
200 movq %mm2, 8*2(%esi) /* tm2; free mm2 */
201 movq %mm6, %mm0 /* duplicate t119=t123 */
202 movq %mm3, 8*12(%esi) /* tm12; free mm3 */
203 paddsw %mm4, %mm6 /* tm4 */
204/* moved from next block */
205 movq 8*5(%ecx), %mm1
206 psubsw %mm4, %mm0 /* tm10; free mm4 */
207/* moved from next block */
208 pmulhw 8*5(%esi), %mm1 /* V5 */
209 movq %mm6, 8*4(%esi) /* tm4; free mm6 */
210 movq %mm0, 8*10(%esi) /* tm10; free mm0 */
211/* column 1: even part
212 * use V5, V13, V1, V9 to produce V56..V59
213 */
214/* moved to prev block:
215 *movq 8*5(%ecx), %mm1
216 * pmulhw 8*5(%esi), %mm1 V5
217 */
218 movq 8*13(%ecx), %mm7
219 psllw $1, %mm1 /* t128=t130 */
220 pmulhw 8*13(%esi), %mm7 /* V13 */
221 movq %mm1, %mm2 /* duplicate t128=t130 */
222 movq 8(%ecx), %mm3
223 pmulhw 8(%esi), %mm3 /* V1 */
224 movq 8*9(%ecx), %mm5
225 psubsw %mm7, %mm1 /* V50 */
226 pmulhw 8*9(%esi), %mm5 /* V9 */
227 paddsw %mm7, %mm2 /* V51 */
228 pmulhw x5a825a825a825a82, %mm1/* 23170 ->V52 */
229 movq %mm2, %mm6 /* duplicate V51 */
230 psraw $1, %mm2 /* t138=t144 */
231 movq %mm3, %mm4 /* duplicate V1 */
232 psraw $2, %mm6 /* t136 */
233 paddsw %mm5, %mm3 /* V53 */
234 psubsw %mm5, %mm4 /* V54 ;mm5 free */
235 movq %mm3, %mm7 /* duplicate V53 */
236/* moved from next block */
237 movq 8*11(%ecx), %mm0
238 psraw $1, %mm4 /* t140=t142 */
239 psubsw %mm6, %mm1 /* V55 ; mm6 free */
240 paddsw %mm2, %mm3 /* V56 */
241 movq %mm4, %mm5 /* duplicate t140=t142 */
242 paddsw %mm1, %mm4 /* V57 */
243 movq %mm3, 8*5(%esi) /* V56 */
244 psubsw %mm1, %mm5 /* V58; mm1 free */
245 movq %mm4, 8*13(%esi) /* V57 */
246 psubsw %mm2, %mm7 /* V59; mm2 free */
247 movq %mm5, 8*9(%esi) /* V58 */
248/* keep mm7 alive all along the next block
249 * movq %mm7, 8(%esi) V59
250 * moved above
251 *movq 8*11(%ecx), %mm0
252 */
253 pmulhw 8*11(%esi), %mm0 /* V11 */
254 movq 8*7(%ecx), %mm6
255 pmulhw 8*7(%esi), %mm6 /* V7 */
256 movq 8*15(%ecx), %mm4
257 movq %mm0, %mm3 /* duplicate V11 */
258 pmulhw 8*15(%esi), %mm4 /* V15 */
259 movq 8*3(%ecx), %mm5
260 psllw $1, %mm6 /* t146=t152 */
261 pmulhw 8*3(%esi), %mm5 /* V3 */
262 paddsw %mm6, %mm0 /* V63 */
263/* note that V15 computation has a correction step:
264 * this is a 'magic' constant that rebiases the results to be closer to the
265 * expected result. this magic constant can be refined to reduce the error
266 * even more by doing the correction step in a later stage when the number
267 * is actually multiplied by 16
268 */
269 paddw x0005000200010001, %mm4 /* per-lane bias 1, 1, 2, 5 (low to high word) */
270 psubsw %mm6, %mm3 /* V60 ; free mm6 */
271 psraw $1, %mm0 /* t154=t156 */
272 movq %mm3, %mm1 /* duplicate V60 */
273 pmulhw x539f539f539f539f, %mm1/* V67 */
274 movq %mm5, %mm6 /* duplicate V3 */
275 psraw $2, %mm4 /* t148=t150 */
276 paddsw %mm4, %mm5 /* V61 */
277 psubsw %mm4, %mm6 /* V62 ; free mm4 */
278 movq %mm5, %mm4 /* duplicate V61 */
279 psllw $1, %mm1 /* t169 */
280 paddsw %mm0, %mm5 /* V65 -> result */
281 psubsw %mm0, %mm4 /* V64 ; free mm0 */
282 pmulhw x5a825a825a825a82, %mm4/* V68 */
283 psraw $1, %mm3 /* t158 */
284 psubsw %mm6, %mm3 /* V66 */
285 movq %mm5, %mm2 /* duplicate V65 */
286 pmulhw x61f861f861f861f8, %mm3/* V70 */
287 psllw $1, %mm6 /* t165 */
288 pmulhw x4546454645464546, %mm6/* V69 */
289 psraw $1, %mm2 /* t172 */
290/* moved from next block */
291 movq 8*5(%esi), %mm0 /* V56 */
292 psllw $1, %mm4 /* t174 */
293/* moved from next block */
294 psraw $1, %mm0 /* t177=t188 */
295 nop
296 psubsw %mm3, %mm6 /* V72 */
297 psubsw %mm1, %mm3 /* V71 ; free mm1 */
298 psubsw %mm2, %mm6 /* V73 ; free mm2 */
299/* moved from next block */
300 psraw $1, %mm5 /* t178=t189 */
301 psubsw %mm6, %mm4 /* V74 */
302/* moved from next block */
303 movq %mm0, %mm1 /* duplicate t177=t188 */
304 paddsw %mm4, %mm3 /* V75 */
305/* moved from next block */
306 paddsw %mm5, %mm0 /* tm1 */
307/* location
308 * 5 - V56
309 * 13 - V57
310 * 9 - V58
311 * X - V59, mm7
312 * X - V65, mm5
313 * X - V73, mm6
314 * X - V74, mm4
315 * X - V75, mm3
316 * free mm0, mm1 & mm2
317 * moved above
318 * movq 8*5(%esi), %mm0 V56
319 * psraw $1, %mm0 t177=t188 ! new !!
320 * psraw $1, %mm5 t178=t189 ! new !!
321 * movq %mm0, %mm1 duplicate t177=t188
322 * paddsw %mm5, %mm0 tm1
323 */
324 movq 8*13(%esi), %mm2 /* V57 */
325 psubsw %mm5, %mm1 /* tm15; free mm5 */
326 movq %mm0, 8(%esi) /* tm1; free mm0 */
327 psraw $1, %mm7 /* t182=t184 ! new !! */
328/* save the store as used directly in the transpose
329 * movq %mm1, 120(%esi) tm15; free mm1
330 */
331 movq %mm7, %mm5 /* duplicate t182=t184 */
332 psubsw %mm3, %mm7 /* tm7 */
333 paddsw %mm3, %mm5 /* tm9; free mm3 */
334 movq 8*9(%esi), %mm0 /* V58 */
335 movq %mm2, %mm3 /* duplicate V57 */
336 movq %mm7, 8*7(%esi) /* tm7; free mm7 */
337 psubsw %mm6, %mm3 /* tm13 */
338 paddsw %mm6, %mm2 /* tm3 ; free mm6 */
339/* moved up from the transpose */
340 movq %mm3, %mm7
341/* moved up from the transpose */
342 punpcklwd %mm1, %mm3
343 movq %mm0, %mm6 /* duplicate V58 */
344 movq %mm2, 8*3(%esi) /* tm3; free mm2 */
345 paddsw %mm4, %mm0 /* tm5 */
346 psubsw %mm4, %mm6 /* tm11; free mm4 */
347/* moved up from the transpose */
348 punpckhwd %mm1, %mm7
349 movq %mm0, 8*5(%esi) /* tm5; free mm0 */
350/* moved up from the transpose */
351 movq %mm5, %mm2
352/* transpose - M4 part
353 * --------- ---------
354 * | M1 | M2 | | M1'| M3'|
355 * --------- --> ---------
356 * | M3 | M4 | | M2'| M4'|
357 * --------- ---------
358 * Two alternatives: use full mmword approach so the following code can be
359 * scheduled before the transpose is done without stores, or use the faster
360 * half mmword stores (when possible)
361 */
362 movd %mm3, 8*9+4(%esi) /* MS part of tmt9 */
363 punpcklwd %mm6, %mm5
364 movd %mm7, 8*13+4(%esi) /* MS part of tmt13 */
365 punpckhwd %mm6, %mm2
366 movd %mm5, 8*9(%esi) /* LS part of tmt9 */
367 punpckhdq %mm3, %mm5 /* free mm3 */
368 movd %mm2, 8*13(%esi) /* LS part of tmt13 */
369 punpckhdq %mm7, %mm2 /* free mm7 */
370/* moved up from the M3 transpose */
371 movq 8*8(%esi), %mm0
372/* moved up from the M3 transpose */
373 movq 8*10(%esi), %mm1
374/* moved up from the M3 transpose */
375 movq %mm0, %mm3
376/* shuffle the rest of the data, and write it with 2 mmword writes */
377 movq %mm5, 8*11(%esi) /* tmt11 */
378/* moved up from the M3 transpose */
379 punpcklwd %mm1, %mm0
380 movq %mm2, 8*15(%esi) /* tmt15 */
381/* moved up from the M3 transpose */
382 punpckhwd %mm1, %mm3
383/* transpose - M3 part
384 * moved up to previous code section
385 *movq 8*8(%esi), %mm0
386 *movq 8*10(%esi), %mm1
387 *movq %mm0, %mm3
388 *punpcklwd %mm1, %mm0
389 *punpckhwd %mm1, %mm3
390 */
391 movq 8*12(%esi), %mm6
392 movq 8*14(%esi), %mm4
393 movq %mm6, %mm2
394/* shuffle the data and write the lower parts of the transposed in 4 dwords */
395 punpcklwd %mm4, %mm6
396 movq %mm0, %mm1
397 punpckhdq %mm6, %mm1
398 movq %mm3, %mm7
399 punpckhwd %mm4, %mm2 /* free mm4 */
400 punpckldq %mm6, %mm0 /* free mm6 */
401/* moved from next block */
402 movq 8*13(%esi), %mm4 /* tmt13 */
403 punpckldq %mm2, %mm3
404 punpckhdq %mm2, %mm7 /* free mm2 */
405/* moved from next block */
406 movq %mm3, %mm5 /* duplicate tmt5 */
/* Second pass starts here: the butterflies below work on the transposed
 * tmtN values, without the preSC multiply used in the first pass.
 */
407/* column 1: even part (after transpose)
408* moved above
409 * movq %mm3, %mm5 duplicate tmt5
410 * movq 8*13(%esi), %mm4 tmt13
411*/
412 psubsw %mm4, %mm3 /* V134 */
413 pmulhw x5a825a825a825a82, %mm3/* 23170 ->V136 */
414 movq 8*9(%esi), %mm6 /* tmt9 */
415 paddsw %mm4, %mm5 /* V135 ; mm4 free */
416 movq %mm0, %mm4 /* duplicate tmt1 */
417 paddsw %mm6, %mm0 /* V137 */
418 psubsw %mm6, %mm4 /* V138 ; mm6 free */
419 psllw $2, %mm3 /* t290 */
420 psubsw %mm5, %mm3 /* V139 */
421 movq %mm0, %mm6 /* duplicate V137 */
422 paddsw %mm5, %mm0 /* V140 */
423 movq %mm4, %mm2 /* duplicate V138 */
424 paddsw %mm3, %mm2 /* V141 */
425 psubsw %mm3, %mm4 /* V142 ; mm3 free */
426 movq %mm0, 8*9(%esi) /* V140 */
427 psubsw %mm5, %mm6 /* V143 ; mm5 free */
428/* moved from next block */
429 movq 8*11(%esi), %mm0 /* tmt11 */
430 movq %mm2, 8*13(%esi) /* V141 */
431/* moved from next block */
432 movq %mm0, %mm2 /* duplicate tmt11 */
433/* column 1: odd part (after transpose) */
434/* moved up to the prev block
435 * movq 8*11(%esi), %mm0 tmt11
436 * movq %mm0, %mm2 duplicate tmt11
437 */
438 movq 8*15(%esi), %mm5 /* tmt15 */
439 psubsw %mm7, %mm0 /* V144 */
440 movq %mm0, %mm3 /* duplicate V144 */
441 paddsw %mm7, %mm2 /* V147 ; free mm7 */
442 pmulhw x539f539f539f539f, %mm0/* 21407-> V151 */
443 movq %mm1, %mm7 /* duplicate tmt3 */
444 paddsw %mm5, %mm7 /* V145 */
445 psubsw %mm5, %mm1 /* V146 ; free mm5 */
446 psubsw %mm1, %mm3 /* V150 */
447 movq %mm7, %mm5 /* duplicate V145 */
448 pmulhw x4546454645464546, %mm1/* 17734-> V153 */
449 psubsw %mm2, %mm5 /* V148 */
450 pmulhw x61f861f861f861f8, %mm3/* 25080-> V154 */
451 psllw $2, %mm0 /* t311 */
452 pmulhw x5a825a825a825a82, %mm5/* 23170-> V152 */
453 paddsw %mm2, %mm7 /* V149 ; free mm2 */
454 psllw $1, %mm1 /* t313 */
455 nop/* without the nop - freeze here for one clock */
456 movq %mm3, %mm2 /* duplicate V154 */
457 psubsw %mm0, %mm3 /* V155 ; free mm0 */
458 psubsw %mm2, %mm1 /* V156 ; free mm2 */
459/* moved from the next block */
460 movq %mm6, %mm2 /* duplicate V143 */
461/* moved from the next block */
462 movq 8*13(%esi), %mm0 /* V141 */
463 psllw $1, %mm1 /* t315 */
464 psubsw %mm7, %mm1 /* V157 (keep V149) */
465 psllw $2, %mm5 /* t317 */
466 psubsw %mm1, %mm5 /* V158 */
467 psllw $1, %mm3 /* t319 */
468 paddsw %mm5, %mm3 /* V159 */
469/* column 1: output butterfly (after transform)
470 * moved to the prev block
471 * movq %mm6, %mm2 duplicate V143
472 * movq 8*13(%esi), %mm0 V141
473 */
474 psubsw %mm3, %mm2 /* V163 */
475 paddsw %mm3, %mm6 /* V164 ; free mm3 */
476 movq %mm4, %mm3 /* duplicate V142 */
477 psubsw %mm5, %mm4 /* V165 ; free mm5 */
478 movq %mm2, (%esp) /* out7 */
479 psraw $4, %mm6
480 psraw $4, %mm4
481 paddsw %mm5, %mm3 /* V162 */
482 movq 8*9(%esi), %mm2 /* V140 */
483 movq %mm0, %mm5 /* duplicate V141 */
484/* in order not to percolate this line up,
485 * we read 72(%esi) very near to this location
486 */
487 movq %mm6, 8*9(%esi) /* out9 */
488 paddsw %mm1, %mm0 /* V161 */
489 movq %mm3, 8(%esp) /* out5 */
490 psubsw %mm1, %mm5 /* V166 ; free mm1 */
491 movq %mm4, 8*11(%esi) /* out11 */
492 psraw $4, %mm5
493 movq %mm0, 16(%esp) /* out3 */
494 movq %mm2, %mm4 /* duplicate V140 */
495 movq %mm5, 8*13(%esi) /* out13 */
496 paddsw %mm7, %mm2 /* V160 */
497/* moved from the next block */
498 movq 8(%esi), %mm0
499 psubsw %mm7, %mm4 /* V167 ; free mm7 */
500/* moved from the next block */
501 movq 8*3(%esi), %mm7
502 psraw $4, %mm4
503 movq %mm2, 24(%esp) /* out1 */
504/* moved from the next block */
505 movq %mm0, %mm1
506 movq %mm4, 8*15(%esi) /* out15 */
507/* moved from the next block */
508 punpcklwd %mm7, %mm0
509/* transpose - M2 parts
510 * moved up to the prev block
511 *movq 8(%esi), %mm0
512 *movq 8*3(%esi), %mm7
513 *movq %mm0, %mm1
514 *punpcklwd %mm7, %mm0
515 */
516 movq 8*5(%esi), %mm5
517 punpckhwd %mm7, %mm1
518 movq 8*7(%esi), %mm4
519 movq %mm5, %mm3
520/* shuffle the data and write the lower parts of the transposed in 4 dwords */
521 movd %mm0, 8*8(%esi) /* LS part of tmt8 */
522 punpcklwd %mm4, %mm5
523 movd %mm1, 8*12(%esi) /* LS part of tmt12 */
524 punpckhwd %mm4, %mm3
525 movd %mm5, 8*8+4(%esi) /* MS part of tmt8 */
526 punpckhdq %mm5, %mm0 /* tmt10 */
527 movd %mm3, 8*12+4(%esi) /* MS part of tmt12 */
528 punpckhdq %mm3, %mm1 /* tmt14 */
529/* transpose - M1 parts */
530 movq (%esi), %mm7
531 movq 8*2(%esi), %mm2
532 movq %mm7, %mm6
533 movq 8*4(%esi), %mm5
534 punpcklwd %mm2, %mm7
535 movq 8*6(%esi), %mm4
536 punpckhwd %mm2, %mm6 /* free mm2 */
537 movq %mm5, %mm3
538 punpcklwd %mm4, %mm5
539 punpckhwd %mm4, %mm3 /* free mm4 */
540 movq %mm7, %mm2
541 movq %mm6, %mm4
542 punpckldq %mm5, %mm7 /* tmt0 */
543 punpckhdq %mm5, %mm2 /* tmt2 ; free mm5 */
544/* shuffle the rest of the data, and write it with 2 mmword writes */
545 punpckldq %mm3, %mm6 /* tmt4 */
546/* moved from next block */
547 movq %mm2, %mm5 /* duplicate tmt2 */
548 punpckhdq %mm3, %mm4 /* tmt6 ; free mm3 */
549/* moved from next block */
550 movq %mm0, %mm3 /* duplicate tmt10 */
551/* column 0: odd part (after transpose)
552 *moved up to prev block
553 * movq %mm0, %mm3 duplicate tmt10
554 * movq %mm2, %mm5 duplicate tmt2
555 */
556 psubsw %mm4, %mm0 /* V110 */
557 paddsw %mm4, %mm3 /* V113 ; free mm4 */
558 movq %mm0, %mm4 /* duplicate V110 */
559 paddsw %mm1, %mm2 /* V111 */
560 pmulhw x539f539f539f539f, %mm0/* 21407-> V117 */
561 psubsw %mm1, %mm5 /* V112 ; free mm1 */
562 psubsw %mm5, %mm4 /* V116 */
563 movq %mm2, %mm1 /* duplicate V111 */
564 pmulhw x4546454645464546, %mm5/* 17734-> V119 */
565 psubsw %mm3, %mm2 /* V114 */
566 pmulhw x61f861f861f861f8, %mm4/* 25080-> V120 */
567 paddsw %mm3, %mm1 /* V115 ; free mm3 */
568 pmulhw x5a825a825a825a82, %mm2/* 23170-> V118 */
569 psllw $2, %mm0 /* t266 */
570 movq %mm1, (%esi) /* save V115 */
571 psllw $1, %mm5 /* t268 */
572 psubsw %mm4, %mm5 /* V122 */
573 psubsw %mm0, %mm4 /* V121 ; free mm0 */
574 psllw $1, %mm5 /* t270 */
575 psubsw %mm1, %mm5 /* V123 ; free mm1 */
576 psllw $2, %mm2 /* t272 */
577 psubsw %mm5, %mm2 /* V124 (keep V123) */
578 psllw $1, %mm4 /* t274 */
579 movq %mm5, 8*2(%esi) /* save V123 ; free mm5 */
580 paddsw %mm2, %mm4 /* V125 (keep V124) */
581/* column 0: even part (after transpose) */
582 movq 8*12(%esi), %mm0 /* tmt12 */
583 movq %mm6, %mm3 /* duplicate tmt4 */
584 psubsw %mm0, %mm6 /* V100 */
585 paddsw %mm0, %mm3 /* V101 ; free mm0 */
586 pmulhw x5a825a825a825a82, %mm6/* 23170 ->V102 */
587 movq %mm7, %mm5 /* duplicate tmt0 */
588 movq 8*8(%esi), %mm1 /* tmt8 */
589 paddsw %mm1, %mm7 /* V103 */
590 psubsw %mm1, %mm5 /* V104 ; free mm1 */
591 movq %mm7, %mm0 /* duplicate V103 */
592 psllw $2, %mm6 /* t245 */
593 paddsw %mm3, %mm7 /* V106 */
594 movq %mm5, %mm1 /* duplicate V104 */
595 psubsw %mm3, %mm6 /* V105 */
596 psubsw %mm3, %mm0 /* V109; free mm3 */
597 paddsw %mm6, %mm5 /* V107 */
598 psubsw %mm6, %mm1 /* V108 ; free mm6 */
599/* column 0: output butterfly (after transform) */
600 movq %mm1, %mm3 /* duplicate V108 */
601 paddsw %mm2, %mm1 /* out4 */
602 psraw $4, %mm1
603 psubsw %mm2, %mm3 /* out10 ; free mm2 */
604 psraw $4, %mm3
605 movq %mm0, %mm6 /* duplicate V109 */
606 movq %mm1, 8*4(%esi) /* out4 ; free mm1 */
607 psubsw %mm4, %mm0 /* out6 */
608 movq %mm3, 8*10(%esi) /* out10 ; free mm3 */
609 psraw $4, %mm0
610 paddsw %mm4, %mm6 /* out8 ; free mm4 */
611 movq %mm7, %mm1 /* duplicate V106 */
612 movq %mm0, 8*6(%esi) /* out6 ; free mm0 */
613 psraw $4, %mm6
614 movq (%esi), %mm4 /* V115 */
615 movq %mm6, 8*8(%esi) /* out8 ; free mm6 */
616 movq %mm5, %mm2 /* duplicate V107 */
617 movq 8*2(%esi), %mm3 /* V123 */
618 paddsw %mm4, %mm7 /* out0 */
619/* moved up from next block */
620 movq 16(%esp), %mm0 /* out3 (unshifted) from scratch */
621 psraw $4, %mm7
622/* moved up from next block */
623 movq 8(%esp), %mm6 /* out5 (unshifted) from scratch */
624 psubsw %mm4, %mm1 /* out14 ; free mm4 */
625 paddsw %mm3, %mm5 /* out2 */
626 psraw $4, %mm1
627 movq %mm7, (%esi) /* out0 ; free mm7 */
628 psraw $4, %mm5
629 movq %mm1, 8*14(%esi) /* out14 ; free mm1 */
630 psubsw %mm3, %mm2 /* out12 ; free mm3 */
631 movq %mm5, 8*2(%esi) /* out2 ; free mm5 */
632 psraw $4, %mm2
633/* moved up to the prev block */
634 movq (%esp), %mm4 /* out7 (unshifted) from scratch */
635/* moved up to the prev block */
636 psraw $4, %mm0
637 movq %mm2, 8*12(%esi) /* out12 ; free mm2 */
638/* moved up to the prev block */
639 psraw $4, %mm6
640/* move back the data to its correct place
641* moved up to the prev block
642 *movq 16(%esp), %mm0
643 *movq 8(%esp), %mm6
644 *movq (%esp), %mm4
645 *psraw $4, %mm0
646 *psraw $4, %mm6
647*/
648 movq 24(%esp), %mm1 /* out1 (unshifted) from scratch */
649 psraw $4, %mm4
650 movq %mm0, 8*3(%esi) /* out3 */
651 psraw $4, %mm1
652 movq %mm6, 8*5(%esi) /* out5 */
653 movq %mm4, 8*7(%esi) /* out7 */
654 movq %mm1, 8(%esi) /* out1 */
655
/* Eight pops release the 32 bytes of scratch; the values landing in %edi
 * are discarded and %edi itself is restored by the pop that follows.
 */
656 popl %edi /* Pop off the temp variables */
657 popl %edi
658 popl %edi
659 popl %edi
660 popl %edi
661 popl %edi
662 popl %edi
663 popl %edi
664
665 popl %edi /* Pop off the old variables */
666 popl %esi
667 popl %edx
668 popl %ecx
669 popl %ebx
670 movl %ebp, %esp
671 popl %ebp
672
673 ret
674.Lfe1:
675 .size IDCT_mmx,.Lfe1-IDCT_mmx