Diffstat (limited to 'src/plugins/ffmpeg/libswscale')
-rw-r--r--  src/plugins/ffmpeg/libswscale/Makefile                        25
-rw-r--r--  src/plugins/ffmpeg/libswscale/cs_test.c                      175
-rw-r--r--  src/plugins/ffmpeg/libswscale/internal_bfin.S                606
-rw-r--r--  src/plugins/ffmpeg/libswscale/rgb2rgb.c                      534
-rw-r--r--  src/plugins/ffmpeg/libswscale/rgb2rgb.h                      142
-rw-r--r--  src/plugins/ffmpeg/libswscale/rgb2rgb_template.c            2738
-rw-r--r--  src/plugins/ffmpeg/libswscale/swscale-example.c              230
-rw-r--r--  src/plugins/ffmpeg/libswscale/swscale.c                     2934
-rw-r--r--  src/plugins/ffmpeg/libswscale/swscale.h                      146
-rw-r--r--  src/plugins/ffmpeg/libswscale/swscale_altivec_template.c     538
-rw-r--r--  src/plugins/ffmpeg/libswscale/swscale_avoption.c              59
-rw-r--r--  src/plugins/ffmpeg/libswscale/swscale_bfin.c                  94
-rw-r--r--  src/plugins/ffmpeg/libswscale/swscale_internal.h             283
-rw-r--r--  src/plugins/ffmpeg/libswscale/swscale_template.c            3295
-rw-r--r--  src/plugins/ffmpeg/libswscale/yuv2rgb.c                      887
-rw-r--r--  src/plugins/ffmpeg/libswscale/yuv2rgb_altivec.c              965
-rw-r--r--  src/plugins/ffmpeg/libswscale/yuv2rgb_bfin.c                 206
-rw-r--r--  src/plugins/ffmpeg/libswscale/yuv2rgb_mlib.c                  85
-rw-r--r--  src/plugins/ffmpeg/libswscale/yuv2rgb_template.c             538
-rw-r--r--  src/plugins/ffmpeg/libswscale/yuv2rgb_vis.c                  207
20 files changed, 0 insertions(+), 14687 deletions(-)
diff --git a/src/plugins/ffmpeg/libswscale/Makefile b/src/plugins/ffmpeg/libswscale/Makefile
deleted file mode 100644
index 93d27ba..0000000
--- a/src/plugins/ffmpeg/libswscale/Makefile
+++ /dev/null
@@ -1,25 +0,0 @@
include $(SUBDIR)../config.mak

NAME = swscale
FFLIBS = avutil

OBJS = rgb2rgb.o swscale.o swscale_avoption.o

OBJS-$(ARCH_BFIN)    += swscale_bfin.o yuv2rgb_bfin.o
OBJS-$(CONFIG_GPL)   += yuv2rgb.o
OBJS-$(CONFIG_MLIB)  += yuv2rgb_mlib.o
OBJS-$(HAVE_ALTIVEC) += yuv2rgb_altivec.o
OBJS-$(HAVE_VIS)     += yuv2rgb_vis.o

ASM_OBJS-$(ARCH_BFIN) += internal_bfin.o

HEADERS = swscale.h rgb2rgb.h

CLEANFILES = cs_test swscale-example

include $(SUBDIR)../subdir.mak

$(SUBDIR)cs_test: $(SUBDIR)cs_test.o $(SUBDIR)$(LIBNAME)

$(SUBDIR)swscale-example: $(SUBDIR)swscale-example.o $(SUBDIR)$(LIBNAME)
$(SUBDIR)swscale-example: EXTRALIBS += -lm
diff --git a/src/plugins/ffmpeg/libswscale/cs_test.c b/src/plugins/ffmpeg/libswscale/cs_test.c
deleted file mode 100644
index d49a605..0000000
--- a/src/plugins/ffmpeg/libswscale/cs_test.c
+++ /dev/null
@@ -1,175 +0,0 @@
/*
 * Copyright (C) 2002 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdio.h>
#include <string.h> /* for memset() */
#include <unistd.h>
#include <stdlib.h>
#include <inttypes.h>

#include "swscale.h"
#include "rgb2rgb.h"

#define SIZE    1000
#define srcByte 0x55
#define dstByte 0xBB

#define FUNC(s,d,n) {s,d,#n,n}

static int cpu_caps;

static char *args_parse(int argc, char *argv[])
{
    int o;

    while ((o = getopt(argc, argv, "m23")) != -1) {
        switch (o) {
        case 'm':
            cpu_caps |= SWS_CPU_CAPS_MMX;
            break;
        case '2':
            cpu_caps |= SWS_CPU_CAPS_MMX2;
            break;
        case '3':
            cpu_caps |= SWS_CPU_CAPS_3DNOW;
            break;
        default:
            av_log(NULL, AV_LOG_ERROR, "Unknown option %c\n", o);
        }
    }

    return argv[optind];
}

int main(int argc, char **argv)
{
    int i, funcNum;
    uint8_t *srcBuffer= (uint8_t*)av_malloc(SIZE);
    uint8_t *dstBuffer= (uint8_t*)av_malloc(SIZE);
    int failedNum=0;
    int passedNum=0;

    av_log(NULL, AV_LOG_INFO, "memory corruption test ...\n");
    args_parse(argc, argv);
    av_log(NULL, AV_LOG_INFO, "CPU capabilities forced to %x\n", cpu_caps);
    sws_rgb2rgb_init(cpu_caps);

    for(funcNum=0; ; funcNum++){
        struct func_info_s {
            int src_bpp;
            int dst_bpp;
            char *name;
            void (*func)(const uint8_t *src, uint8_t *dst, long src_size);
        } func_info[] = {
            FUNC(2, 2, rgb15to16),
            FUNC(2, 3, rgb15to24),
            FUNC(2, 4, rgb15to32),
            FUNC(2, 3, rgb16to24),
            FUNC(2, 4, rgb16to32),
            FUNC(3, 2, rgb24to15),
            FUNC(3, 2, rgb24to16),
            FUNC(3, 4, rgb24to32),
            FUNC(4, 2, rgb32to15),
            FUNC(4, 2, rgb32to16),
            FUNC(4, 3, rgb32to24),
            FUNC(2, 2, rgb16to15),
            FUNC(2, 2, rgb15tobgr15),
            FUNC(2, 2, rgb15tobgr16),
            FUNC(2, 3, rgb15tobgr24),
            FUNC(2, 4, rgb15tobgr32),
            FUNC(2, 2, rgb16tobgr15),
            FUNC(2, 2, rgb16tobgr16),
            FUNC(2, 3, rgb16tobgr24),
            FUNC(2, 4, rgb16tobgr32),
            FUNC(3, 2, rgb24tobgr15),
            FUNC(3, 2, rgb24tobgr16),
            FUNC(3, 3, rgb24tobgr24),
            FUNC(3, 4, rgb24tobgr32),
            FUNC(4, 2, rgb32tobgr15),
            FUNC(4, 2, rgb32tobgr16),
            FUNC(4, 3, rgb32tobgr24),
            FUNC(4, 4, rgb32tobgr32),
            FUNC(0, 0, NULL)
        };
        int width;
        int failed=0;
        int srcBpp=0;
        int dstBpp=0;

        if (!func_info[funcNum].func) break;

        av_log(NULL, AV_LOG_INFO,".");
        memset(srcBuffer, srcByte, SIZE);

        for(width=63; width>0; width--){
            int dstOffset;
            for(dstOffset=128; dstOffset<196; dstOffset+=4){
                int srcOffset;
                memset(dstBuffer, dstByte, SIZE);

                for(srcOffset=128; srcOffset<196; srcOffset+=4){
                    uint8_t *src= srcBuffer+srcOffset;
                    uint8_t *dst= dstBuffer+dstOffset;
                    char *name=NULL;

                    if(failed) break; //don't fill the screen with shit ...

                    srcBpp = func_info[funcNum].src_bpp;
                    dstBpp = func_info[funcNum].dst_bpp;
                    name   = func_info[funcNum].name;

                    func_info[funcNum].func(src, dst, width*srcBpp);

                    if(!srcBpp) break;

                    for(i=0; i<SIZE; i++){
                        if(srcBuffer[i]!=srcByte){
                            av_log(NULL, AV_LOG_INFO, "src damaged at %d w:%d src:%d dst:%d %s\n",
                                   i, width, srcOffset, dstOffset, name);
                            failed=1;
                            break;
                        }
                    }
                    for(i=0; i<dstOffset; i++){
                        if(dstBuffer[i]!=dstByte){
                            av_log(NULL, AV_LOG_INFO, "dst damaged at %d w:%d src:%d dst:%d %s\n",
                                   i, width, srcOffset, dstOffset, name);
                            failed=1;
                            break;
                        }
                    }
                    for(i=dstOffset + width*dstBpp; i<SIZE; i++){
                        if(dstBuffer[i]!=dstByte){
                            av_log(NULL, AV_LOG_INFO, "dst damaged at %d w:%d src:%d dst:%d %s\n",
                                   i, width, srcOffset, dstOffset, name);
                            failed=1;
                            break;
                        }
                    }
                }
            }
        }
        if(failed) failedNum++;
        else if(srcBpp) passedNum++;
    }

    av_log(NULL, AV_LOG_INFO, "\n%d converters passed, %d converters randomly overwrote memory\n", passedNum, failedNum);
    return failedNum;
}
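cs_test.c above is built around a guard-byte technique: the source buffer is filled with the sentinel 0x55 and the destination with 0xBB, each converter runs on a small window at varying widths and offsets, and any byte that changes outside the written window counts as memory corruption. A minimal standalone sketch of the same check follows; check_guards and the identity convert() are illustrative names for this sketch, not part of libswscale.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define BUF_SIZE 1000
#define SRC_BYTE 0x55 /* sentinel for the source buffer */
#define DST_BYTE 0xBB /* sentinel for the destination buffer */

/* Verify that no byte outside [off, off+len) lost its sentinel value.
 * Returns 0 if the guard area is intact, -1 on the first corrupted byte. */
static int check_guards(const uint8_t *buf, int off, int len, uint8_t sentinel)
{
    int i;
    for (i = 0; i < BUF_SIZE; i++) {
        if (i >= off && i < off + len)
            continue; /* inside the legitimate write window */
        if (buf[i] != sentinel) {
            fprintf(stderr, "guard byte damaged at %d\n", i);
            return -1;
        }
    }
    return 0;
}

/* Stand-in for a converter under test: copies len bytes unchanged. */
static void convert(const uint8_t *src, uint8_t *dst, long len)
{
    memcpy(dst, src, len);
}

int main(void)
{
    uint8_t src[BUF_SIZE], dst[BUF_SIZE];
    int width, ret = 0;

    memset(src, SRC_BYTE, BUF_SIZE);
    for (width = 63; width > 0; width--) {
        memset(dst, DST_BYTE, BUF_SIZE);
        convert(src + 128, dst + 128, width);
        /* src must be completely untouched; dst only inside its window */
        ret |= check_guards(src, 0, 0, SRC_BYTE);
        ret |= check_guards(dst, 128, width, DST_BYTE);
    }
    return ret;
}

Sweeping width down from 63 covers every residue modulo the SIMD block size, which is exactly how the real test catches converters that overrun their tail-handling code.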
diff --git a/src/plugins/ffmpeg/libswscale/internal_bfin.S b/src/plugins/ffmpeg/libswscale/internal_bfin.S
deleted file mode 100644
index fb7bda7..0000000
--- a/src/plugins/ffmpeg/libswscale/internal_bfin.S
+++ /dev/null
@@ -1,606 +0,0 @@
/*
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 * April 20, 2007
 *
 * Blackfin video color space converter operations
 * convert I420 YV12 to RGB in various formats
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */


/*
YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits, packed into shorts.

The following calculation is used for the conversion:

  r = clipz((y-oy)*cy + crv*(v-128))
  g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128))
  b = clipz((y-oy)*cy + cbu*(u-128))

y,u,v are prescaled by a factor of 4, i.e. left-shifted, to gain precision.

The factorization below eliminates the truncation error which was
occurring due to byteop3p:

1) Use byteop16m to subtract quad bytes; it operates on unsigned 8-bit
   data, so the offsets need to be renormalized to 8 bits.

2) Scale operands up by a factor of 4, not 8, because Blackfin
   multiplies include a shift.

3) Compute cy*yx0 and cy*yx1 into the accumulators.

4) Compute each of the linear equations:

   r = clipz((y - oy) * cy + crv * (v - 128))

   g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128))

   b = clipz((y - oy) * cy + cbu * (u - 128))

   Reuse of the accumulators requires that we multiply twice, once with
   an addition and the second time with a subtraction.

   Because of this we need to compute the equations in the order R, B,
   then G, saving the writes for B in the case of 24/32-bit color
   formats.

   API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
                      int dW, uint32_t *coeffs);

        A          B
       ---        ---
       i2 = cb    i3 = cr
       i1 = coeff i0 = y

Where coeffs has the following layout in memory:

  uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv;

coeffs is a pointer to oy.

The {rgb} masks are only used by the 565 packing algorithm. Note that data
replication is used to simplify the internal algorithms for the dual-MAC
architecture of the Blackfin.

All routines are exported with an _ff_bfin_ symbol prefix.

Rough performance gain compared against -O3:

  2779809/1484290 187.28%

which translates to ~33-57 cycles/pel for the reference versus ~17.5
cycles/pel for the optimized implementations. It is unclear why there is
such a huge variation between the reference codes on Blackfin; it
presumably has to do with the memory system.
*/

#define mL3 .text
#ifdef __FDPIC__
#define mL1 .l1.text
#else
#define mL1 mL3
#endif
#define MEM mL1

#define DEFUN(fname,where,interface)        \
        .section where;                     \
        .global _ff_bfin_ ## fname;         \
        .type _ff_bfin_ ## fname, STT_FUNC; \
        .align 8;                           \
        _ff_bfin_ ## fname

#define DEFUN_END(fname) \
        .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname


.text

#define COEFF_LEN        11*4
#define COEFF_REL_CY_OFF 4*4

#define ARG_OUT   20
#define ARG_W     24
#define ARG_COEFF 28

DEFUN(yuv2rgb565_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF];
        i1 = r0;
        b1 = i1;
        l1 = COEFF_LEN;
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        r0 = [i0++];    // 2Y
        r1.l = w[i2++]; // 2u
        r1.h = w[i3++]; // 2v
        p0 = p0>>2;

        lsetup (.L0565, .L1565) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
        r2=[i1++]; // oy
.L0565:
        /*
           rrrrrrrr gggggggg bbbbbbbb
           5432109876543210
           bbbbb    >>3
           gggggggg <<3
           rrrrrrrr <<8
           rrrrrggggggbbbbb
        */
        (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);              // y1,y0
        r4 = r4 << 2 (v);              // y3,y2
        r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
        r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0  r1=cy
        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++];    // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
        a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++];    // cgu
        r2 = r2 << 8 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
        a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0];  // gmask
        r2 = r2 << 3 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3 || r1=[i1++]; // cy

        /* Y' = y*cy */

        a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
        a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++];    // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
        a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++];    // cgu
        r2 = r2 << 8 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
        a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
        r2 = byteop3p(r3:2, r1:0)(LO) || r0 = [i0++];  // 2Y
        r2 = r2 << 3 (v) || r1.l = w[i2++];            // 2u
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3 || r1.h = w[i3++]; // 2v
.L1565: r2=[i1++]; // oy

        l1 = 0;

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuv2rgb565_line)

DEFUN(yuv2rgb555_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF];
        i1 = r0;
        b1 = i1;
        l1 = COEFF_LEN;
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        r0 = [i0++];    // 2Y
        r1.l = w[i2++]; // 2u
        r1.h = w[i3++]; // 2v
        p0 = p0>>2;

        lsetup (.L0555, .L1555) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
        r2=[i1++]; // oy
.L0555:
        /*
           rrrrrrrr gggggggg bbbbbbbb
           5432109876543210
           bbbbb    >>3
           gggggggg <<2
           rrrrrrrr <<7
           xrrrrrgggggbbbbb
        */

        (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);              // y1,y0
        r4 = r4 << 2 (v);              // y3,y2
        r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
        r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0  r1=cy
        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++];    // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
        a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++];    // cgu
        r2 = r2 << 7 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
        a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0];  // gmask
        r2 = r2 << 2 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3 || r1=[i1++]; // cy

        /* Y' = y*cy */

        a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
        a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++];    // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
        a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++];    // cgu
        r2 = r2 << 7 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
        a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
        r2 = byteop3p(r3:2, r1:0)(LO) || r0=[i0++];    // 4Y
        r2 = r2 << 2 (v) || r1.l=w[i2++];              // 2u
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3 || r1.h=w[i3++]; // 2v

.L1555: r2=[i1++]; // oy

        l1 = 0;

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuv2rgb555_line)

DEFUN(yuv2rgb24_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];
        p2 = p1;
        p2 += 3;

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF]; // coeff buffer
        i1 = r0;
        b1 = i1;
        l1 = COEFF_LEN;
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        r0 = [i0++];    // 2Y
        r1.l = w[i2++]; // 2u
        r1.h = w[i3++]; // 2v
        p0 = p0>>2;

        lsetup (.L0888, .L1888) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
        r2=[i1++]; // oy
.L0888:
        (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);              // y1,y0
        r4 = r4 << 2 (v);              // y3,y2
        r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
        r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0  r1=cy

        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++];    // cbu
        r2=r2>>16 || B[p1++]=r2;
        B[p2++]=r2;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
        a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
        r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++];    // cgu

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
        a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0];  // gmask, oy,cy,zero

        r2=r2>>16 || B[p1++]=r2;
        B[p2++]=r2;

        r3=r3>>16 || B[p1++]=r3;
        B[p2++]=r3 || r1=[i1++]; // cy

        p1+=3;
        p2+=3;
        /* Y' = y*cy */
        a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
        a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++];    // cbu
        r2=r2>>16 || B[p1++]=r2;
        B[p2++]=r2;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
        a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
        r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++];    // cgu

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
        a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
        r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++];    // gmask
        r2=r2>>16 || B[p1++]=r2 || r0 = [i0++];        // 4y
        B[p2++]=r2 || r1.l = w[i2++];                  // 2u
        r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++];     // 2v
        B[p2++]=r3 || r2=[i1++];                       // oy

        p1+=3;
.L1888: p2+=3;

        l1 = 0;

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuv2rgb24_line)



#define ARG_vdst        20
#define ARG_width       24
#define ARG_height      28
#define ARG_lumStride   32
#define ARG_chromStride 36
#define ARG_srcStride   40

DEFUN(uyvytoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                        long width, long height,
                        long lumStride, long chromStride, long srcStride)):
        link 0;
        [--sp] = (r7:4,p5:4);

        p0 = r1; // Y top even

        i2 = r2; // *u
        r2 = [fp + ARG_vdst];
        i3 = r2; // *v

        r1 = [fp + ARG_srcStride];
        r2 = r0 + r1;
        r1 += -8; // i0,i1 is pre read need to correct
        m0 = r1;

        i0 = r0; // uyvy_T even
        i1 = r2; // uyvy_B odd

        p2 = [fp + ARG_lumStride];
        p1 = p0 + p2; // Y bot odd

        p5 = [fp + ARG_width];
        p4 = [fp + ARG_height];
        r0 = p5;
        p4 = p4 >> 1;
        p5 = p5 >> 2;

        r2 = [fp + ARG_chromStride];
        r0 = r0 >> 1;
        r2 = r2 - r0;
        m1 = r2;

        /* I0,I1 - src input line pointers
         * p0,p1 - luma output line pointers
         * I2    - dstU
         * I3    - dstV
         */

        lsetup (0f, 1f) lc1 = p4; // H/2
0:      r0 = [i0++] || r2 = [i1++];
        r1 = [i0++] || r3 = [i1++];
        r4 = byteop1p(r1:0, r3:2);
        r5 = byteop1p(r1:0, r3:2) (r);
        lsetup (2f, 3f) lc0 = p5; // W/4
2:      r0 = r0 >> 8(v);
        r1 = r1 >> 8(v);
        r2 = r2 >> 8(v);
        r3 = r3 >> 8(v);
        r0 = bytepack(r0, r1);
        r2 = bytepack(r2, r3) || [p0++] = r0; // yyyy
        r6 = pack(r5.l, r4.l) || [p1++] = r2; // yyyy
        r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++];
        r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++];
        r4 = byteop1p(r1:0, r3:2) || w[i2++] = r6.l;     // uu
3:      r5 = byteop1p(r1:0, r3:2) (r) || w[i3++] = r6.h; // vv

        i0 += m0;
        i1 += m0;
        i2 += m1;
        i3 += m1;
        p0 = p0 + p2;
1:      p1 = p1 + p2;

        (r7:4,p5:4) = [sp++];
        unlink;
        rts;
DEFUN_END(uyvytoyv12)

DEFUN(yuyvtoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                        long width, long height,
                        long lumStride, long chromStride, long srcStride)):
        link 0;
        [--sp] = (r7:4,p5:4);

        p0 = r1; // Y top even

        i2 = r2; // *u
        r2 = [fp + ARG_vdst];
        i3 = r2; // *v

        r1 = [fp + ARG_srcStride];
        r2 = r0 + r1;
        r1 += -8; // i0,i1 is pre read need to correct
        m0 = r1;

        i0 = r0; // uyvy_T even
        i1 = r2; // uyvy_B odd

        p2 = [fp + ARG_lumStride];
        p1 = p0 + p2; // Y bot odd

        p5 = [fp + ARG_width];
        p4 = [fp + ARG_height];
        r0 = p5;
        p4 = p4 >> 1;
        p5 = p5 >> 2;

        r2 = [fp + ARG_chromStride];
        r0 = r0 >> 1;
        r2 = r2 - r0;
        m1 = r2;

        /* I0,I1 - src input line pointers
         * p0,p1 - luma output line pointers
         * I2    - dstU
         * I3    - dstV
         */

        lsetup (0f, 1f) lc1 = p4; // H/2
0:      r0 = [i0++] || r2 = [i1++];
        r1 = [i0++] || r3 = [i1++];
        r4 = bytepack(r0, r1);
        r5 = bytepack(r2, r3);
        lsetup (2f, 3f) lc0 = p5; // W/4
2:      r0 = r0 >> 8(v) || [p0++] = r4; // yyyy-even
        r1 = r1 >> 8(v) || [p1++] = r5; // yyyy-odd
        r2 = r2 >> 8(v);
        r3 = r3 >> 8(v);
        r4 = byteop1p(r1:0, r3:2);
        r5 = byteop1p(r1:0, r3:2) (r);
        r6 = pack(r5.l, r4.l);
        r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++];
        r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++];
        r4 = bytepack(r0, r1) || w[i2++] = r6.l; // uu
3:      r5 = bytepack(r2, r3) || w[i3++] = r6.h; // vv

        i0 += m0;
        i1 += m0;
        i2 += m1;
        i3 += m1;
        p0 = p0 + p2;
1:      p1 = p1 + p2;

        (r7:4,p5:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuyvtoyv12)
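For reference, the per-pixel math that the scheduled dual-MAC assembly above implements can be sketched in plain C. The fixed-point format below is an assumption made for the sketch (16.16 factors, with example BT.601 values in the comments); the Blackfin routines instead keep the coefficients in the fractional format described in the header comment, and clipz/yuv_to_rgb565_ref are illustrative names, not exported symbols.

#include <stdint.h>

/* clip to [0,255]; negative values clamp to zero, hence "clipz" */
static inline int clipz(int x) { return x < 0 ? 0 : (x > 255 ? 255 : x); }

/* One pixel of the conversion from the header comment, using 16.16
 * fixed-point coefficients. For BT.601 one would use roughly:
 *   oy = 16, cy = 1.164*65536, crv = 1.596*65536, cbu = 2.018*65536,
 *   cgu = -0.391*65536, cgv = -0.813*65536  (assumed example values). */
static uint16_t yuv_to_rgb565_ref(int y, int u, int v,
                                  int oy, int cy,
                                  int crv, int cbu, int cgu, int cgv)
{
    int luma = (y - oy) * cy;
    int r = clipz((luma + crv * (v - 128)) >> 16);
    int g = clipz((luma + cgv * (v - 128) + cgu * (u - 128)) >> 16);
    int b = clipz((luma + cbu * (u - 128)) >> 16);
    /* RGB565 packing: rrrrrggggggbbbbb */
    return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}

The assembly computes exactly these three linear equations per pixel, but amortizes the luma product by keeping (y-oy)*cy in the accumulators and adding/subtracting the chroma terms around it, which is why R and B must be evaluated before G.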
diff --git a/src/plugins/ffmpeg/libswscale/rgb2rgb.c b/src/plugins/ffmpeg/libswscale/rgb2rgb.c
deleted file mode 100644
index 14c4070..0000000
--- a/src/plugins/ffmpeg/libswscale/rgb2rgb.c
+++ /dev/null
@@ -1,534 +0,0 @@
/*
 * software RGB to RGB converter
 * plus software PAL8 to RGB converter
 * software YUV to YUV converter
 * software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */
#include <inttypes.h>
#include "config.h"
#include "libavutil/x86_cpu.h"
#include "libavutil/bswap.h"
#include "rgb2rgb.h"
#include "swscale.h"
#include "swscale_internal.h"

#define FAST_BGR2YV12 // use 7-bit instead of 15-bit coefficients

void (*rgb24to32)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb32to24)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size);
//void (*rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size);
//void (*rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size);

void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                   long width, long height,
                   long lumStride, long chromStride, long dstStride);
void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                   long width, long height,
                   long lumStride, long chromStride, long dstStride);
void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                      long width, long height,
                      long lumStride, long chromStride, long dstStride);
void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                   long width, long height,
                   long lumStride, long chromStride, long srcStride);
void (*rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                    long width, long height,
                    long lumStride, long chromStride, long srcStride);
void (*planar2x)(const uint8_t *src, uint8_t *dst, long width, long height,
                 long srcStride, long dstStride);
void (*interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dst,
                        long width, long height, long src1Stride,
                        long src2Stride, long dstStride);
void (*vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                    uint8_t *dst1, uint8_t *dst2,
                    long width, long height,
                    long srcStride1, long srcStride2,
                    long dstStride1, long dstStride2);
void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                     uint8_t *dst,
                     long width, long height,
                     long srcStride1, long srcStride2,
                     long srcStride3, long dstStride);

#if defined(ARCH_X86) && defined(CONFIG_GPL)
DECLARE_ASM_CONST(8, uint64_t, mmx_null)     = 0x0000000000000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_one)      = 0xFFFFFFFFFFFFFFFFULL;
DECLARE_ASM_CONST(8, uint64_t, mask32b)      = 0x000000FF000000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mask32g)      = 0x0000FF000000FF00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask32r)      = 0x00FF000000FF0000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask32)       = 0x00FFFFFF00FFFFFFULL;
DECLARE_ASM_CONST(8, uint64_t, mask3216br)   = 0x00F800F800F800F8ULL;
DECLARE_ASM_CONST(8, uint64_t, mask3216g)    = 0x0000FC000000FC00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask3215g)    = 0x0000F8000000F800ULL;
DECLARE_ASM_CONST(8, uint64_t, mul3216)      = 0x2000000420000004ULL;
DECLARE_ASM_CONST(8, uint64_t, mul3215)      = 0x2000000820000008ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24b)      = 0x00FF0000FF0000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mask24g)      = 0xFF0000FF0000FF00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24r)      = 0x0000FF0000FF0000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24l)      = 0x0000000000FFFFFFULL;
DECLARE_ASM_CONST(8, uint64_t, mask24h)      = 0x0000FFFFFF000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24hh)     = 0xffff000000000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24hhh)    = 0xffffffff00000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24hhhh)   = 0xffffffffffff0000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15b)      = 0x001F001F001F001FULL; /* 00000000 00011111  xxB */
DECLARE_ASM_CONST(8, uint64_t, mask15rg)     = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000  RGx */
DECLARE_ASM_CONST(8, uint64_t, mask15s)      = 0xFFE0FFE0FFE0FFE0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15g)      = 0x03E003E003E003E0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15r)      = 0x7C007C007C007C00ULL;
#define mask16b mask15b
DECLARE_ASM_CONST(8, uint64_t, mask16g)      = 0x07E007E007E007E0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask16r)      = 0xF800F800F800F800ULL;
DECLARE_ASM_CONST(8, uint64_t, red_16mask)   = 0x0000f8000000f800ULL;
DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL;
DECLARE_ASM_CONST(8, uint64_t, blue_16mask)  = 0x0000001f0000001fULL;
DECLARE_ASM_CONST(8, uint64_t, red_15mask)   = 0x00007c0000007c00ULL;
DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
DECLARE_ASM_CONST(8, uint64_t, blue_15mask)  = 0x0000001f0000001fULL;

#if 0
static volatile uint64_t __attribute__((aligned(8))) b5Dither;
static volatile uint64_t __attribute__((aligned(8))) g5Dither;
static volatile uint64_t __attribute__((aligned(8))) g6Dither;
static volatile uint64_t __attribute__((aligned(8))) r5Dither;

static uint64_t __attribute__((aligned(8))) dither4[2]={
    0x0103010301030103LL,
    0x0200020002000200LL,};

static uint64_t __attribute__((aligned(8))) dither8[2]={
    0x0602060206020602LL,
    0x0004000400040004LL,};
#endif
#endif /* defined(ARCH_X86) */

#define RGB2YUV_SHIFT 8
#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))

//Note: We have C, MMX, MMX2, 3DNOW versions, there is no 3DNOW + MMX2 one.
//plain C versions
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#undef HAVE_SSE2
#define RENAME(a) a ## _C
#include "rgb2rgb_template.c"

#if defined(ARCH_X86) && defined(CONFIG_GPL)

//MMX versions
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#undef HAVE_SSE2
#define RENAME(a) a ## _MMX
#include "rgb2rgb_template.c"

//MMX2 versions
#undef RENAME
#define HAVE_MMX
#define HAVE_MMX2
#undef HAVE_3DNOW
#undef HAVE_SSE2
#define RENAME(a) a ## _MMX2
#include "rgb2rgb_template.c"

//3DNOW versions
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#define HAVE_3DNOW
#undef HAVE_SSE2
#define RENAME(a) a ## _3DNOW
#include "rgb2rgb_template.c"

#endif //ARCH_X86 || ARCH_X86_64

/*
 RGB15->RGB16 original by Strepto/Astral
 ported to gcc & bugfixed : A'rpi
 MMX2, 3DNOW optimization by Nick Kurshev
 32-bit C version, and and&add trick by Michael Niedermayer
*/

void sws_rgb2rgb_init(int flags){
#if (defined(HAVE_MMX2) || defined(HAVE_3DNOW) || defined(HAVE_MMX)) && defined(CONFIG_GPL)
    if (flags & SWS_CPU_CAPS_MMX2)
        rgb2rgb_init_MMX2();
    else if (flags & SWS_CPU_CAPS_3DNOW)
        rgb2rgb_init_3DNOW();
    else if (flags & SWS_CPU_CAPS_MMX)
        rgb2rgb_init_MMX();
    else
#endif /* defined(HAVE_MMX2) || defined(HAVE_3DNOW) || defined(HAVE_MMX) */
        rgb2rgb_init_C();
}

/**
 * Palette is assumed to contain BGR32.
 */
void palette8torgb32(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
{
    long i;

/*
    for (i=0; i<num_pixels; i++)
        ((unsigned *)dst)[i] = ((unsigned *)palette)[src[i]];
*/

    for (i=0; i<num_pixels; i++)
    {
        #ifdef WORDS_BIGENDIAN
        dst[3]= palette[src[i]*4+2];
        dst[2]= palette[src[i]*4+1];
        dst[1]= palette[src[i]*4+0];
        #else
        //FIXME slow?
        dst[0]= palette[src[i]*4+2];
        dst[1]= palette[src[i]*4+1];
        dst[2]= palette[src[i]*4+0];
        //dst[3]= 0; /* do we need this cleansing? */
        #endif
        dst+= 4;
    }
}

void palette8tobgr32(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
{
    long i;
    for (i=0; i<num_pixels; i++)
    {
        #ifdef WORDS_BIGENDIAN
        dst[3]= palette[src[i]*4+0];
        dst[2]= palette[src[i]*4+1];
        dst[1]= palette[src[i]*4+2];
        #else
        //FIXME slow?
        dst[0]= palette[src[i]*4+0];
        dst[1]= palette[src[i]*4+1];
        dst[2]= palette[src[i]*4+2];
        //dst[3]= 0; /* do we need this cleansing? */
        #endif

        dst+= 4;
    }
}

/**
 * Palette is assumed to contain BGR32.
 */
void palette8torgb24(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
{
    long i;
/*
    Writes 1 byte too much and might cause alignment issues on some architectures?
    for (i=0; i<num_pixels; i++)
        ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[src[i]];
*/
    for (i=0; i<num_pixels; i++)
    {
        //FIXME slow?
        dst[0]= palette[src[i]*4+2];
        dst[1]= palette[src[i]*4+1];
        dst[2]= palette[src[i]*4+0];
        dst+= 3;
    }
}

void palette8tobgr24(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
{
    long i;
/*
    Writes 1 byte too much and might cause alignment issues on some architectures?
    for (i=0; i<num_pixels; i++)
        ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[src[i]];
*/
    for (i=0; i<num_pixels; i++)
    {
        //FIXME slow?
        dst[0]= palette[src[i]*4+0];
        dst[1]= palette[src[i]*4+1];
        dst[2]= palette[src[i]*4+2];
        dst+= 3;
    }
}

/**
 * Palette is assumed to contain BGR16, see rgb32to16 to convert the palette.
 */
void palette8torgb16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
{
    long i;
    for (i=0; i<num_pixels; i++)
        ((uint16_t *)dst)[i] = ((const uint16_t *)palette)[src[i]];
}
void palette8tobgr16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
{
    long i;
    for (i=0; i<num_pixels; i++)
        ((uint16_t *)dst)[i] = bswap_16(((const uint16_t *)palette)[src[i]]);
}

/**
 * Palette is assumed to contain BGR15, see rgb32to15 to convert the palette.
 */
void palette8torgb15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
{
    long i;
    for (i=0; i<num_pixels; i++)
        ((uint16_t *)dst)[i] = ((const uint16_t *)palette)[src[i]];
}
void palette8tobgr15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
{
    long i;
    for (i=0; i<num_pixels; i++)
        ((uint16_t *)dst)[i] = bswap_16(((const uint16_t *)palette)[src[i]]);
}

void rgb32tobgr24(const uint8_t *src, uint8_t *dst, long src_size)
{
    long i;
    long num_pixels = src_size >> 2;
    for (i=0; i<num_pixels; i++)
    {
        #ifdef WORDS_BIGENDIAN
        /* RGB32 (= A,B,G,R) -> BGR24 (= B,G,R) */
        dst[3*i + 0] = src[4*i + 1];
        dst[3*i + 1] = src[4*i + 2];
        dst[3*i + 2] = src[4*i + 3];
        #else
        dst[3*i + 0] = src[4*i + 2];
        dst[3*i + 1] = src[4*i + 1];
        dst[3*i + 2] = src[4*i + 0];
        #endif
    }
}

void rgb24tobgr32(const uint8_t *src, uint8_t *dst, long src_size)
{
    long i;
    for (i=0; 3*i<src_size; i++)
    {
        #ifdef WORDS_BIGENDIAN
        /* RGB24 (= R,G,B) -> BGR32 (= A,R,G,B) */
        dst[4*i + 0] = 0;
        dst[4*i + 1] = src[3*i + 0];
        dst[4*i + 2] = src[3*i + 1];
        dst[4*i + 3] = src[3*i + 2];
        #else
        dst[4*i + 0] = src[3*i + 2];
        dst[4*i + 1] = src[3*i + 1];
        dst[4*i + 2] = src[3*i + 0];
        dst[4*i + 3] = 0;
        #endif
    }
}

void rgb16tobgr32(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        #ifdef WORDS_BIGENDIAN
        *d++ = 0;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        #else
        *d++ = (bgr&0xF800)>>8;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0x1F)<<3;
        *d++ = 0;
        #endif
    }
}

void rgb16tobgr24(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0xF800)>>8;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0x1F)<<3;
    }
}

void rgb16tobgr16(const uint8_t *src, uint8_t *dst, long src_size)
{
    long i;
    long num_pixels = src_size >> 1;

    for (i=0; i<num_pixels; i++)
    {
        unsigned b,g,r;
        register uint16_t rgb;
        rgb = ((const uint16_t *)src)[i];
        r = rgb&0x1F;
        g = (rgb&0x7E0)>>5;
        b = (rgb&0xF800)>>11;
        ((uint16_t *)dst)[i] = (b&0x1F) | ((g&0x3F)<<5) | ((r&0x1F)<<11);
    }
}

void rgb16tobgr15(const uint8_t *src, uint8_t *dst, long src_size)
{
    long i;
    long num_pixels = src_size >> 1;

    for (i=0; i<num_pixels; i++)
    {
        unsigned b,g,r;
        register uint16_t rgb;
        rgb = ((const uint16_t *)src)[i];
        r = rgb&0x1F;
        g = (rgb&0x7E0)>>5;
        b = (rgb&0xF800)>>11;
        /* green is narrowed from 6 to 5 bits, dropping its LSB */
        ((uint16_t *)dst)[i] = (b&0x1F) | (((g>>1)&0x1F)<<5) | ((r&0x1F)<<10);
    }
}

void rgb15tobgr32(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        #ifdef WORDS_BIGENDIAN
        *d++ = 0;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        #else
        *d++ = (bgr&0x7C00)>>7;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x1F)<<3;
        *d++ = 0;
        #endif
    }
}

void rgb15tobgr24(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x1F)<<3;
    }
}

void rgb15tobgr16(const uint8_t *src, uint8_t *dst, long src_size)
{
    long i;
    long num_pixels = src_size >> 1;

    for (i=0; i<num_pixels; i++)
    {
        unsigned b,g,r;
        register uint16_t rgb;
        rgb = ((const uint16_t *)src)[i];
        r = rgb&0x1F;
        g = (rgb&0x3E0)>>5;
        b = (rgb&0x7C00)>>10;
        /* green is widened from 5 to 6 bits */
        ((uint16_t *)dst)[i] = (b&0x1F) | (((g<<1)&0x3F)<<5) | ((r&0x1F)<<11);
    }
}

void rgb15tobgr15(const uint8_t *src, uint8_t *dst, long src_size)
{
    long i;
    long num_pixels = src_size >> 1;

    for (i=0; i<num_pixels; i++)
    {
        unsigned b,g,r;
        register uint16_t rgb;
        rgb = ((const uint16_t *)src)[i];
        r = rgb&0x1F;
        g = (rgb&0x3E0)>>5;
        b = (rgb&0x7C00)>>10;
        ((uint16_t *)dst)[i] = (b&0x1F) | ((g&0x1F)<<5) | ((r&0x1F)<<10);
    }
}

void rgb8tobgr8(const uint8_t *src, uint8_t *dst, long src_size)
{
    long i;
    long num_pixels = src_size;
    for (i=0; i<num_pixels; i++)
    {
        unsigned b,g,r;
        register uint8_t rgb;
        rgb = src[i];
        r = (rgb&0x07);
        g = (rgb&0x38)>>3;
        b = (rgb&0xC0)>>6;
        /* 3:3:2 -> 2:3:3 swap: blue widens to 3 bits, red narrows to 2 */
        dst[i] = ((b<<1)&0x07) | ((g&0x07)<<3) | (((r>>1)&0x03)<<6);
    }
}
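The and&add trick credited in the comment above sws_rgb2rgb_init() deserves one worked line. Masking with 0x7FE0 isolates the red and green fields of an RGB555 pixel, and adding a value to itself is a left shift by one, so (x&0x7FFF) + (x&0x7FE0) widens green to 6 bits and moves red up without any shift instruction. With the 0x7FFF7FFF/0x7FE07FE0 masks used in rgb15to16 the same addition handles two packed pixels per 32-bit word, and since 0x7FFF + 0x7FE0 = 0xFFDF < 0x10000 no carry ever crosses into the neighboring pixel. A self-checking single-pixel sketch (rgb15to16_pixel is an illustrative name for this note, not a library function):

#include <assert.h>
#include <stdint.h>

/* Widen one RGB555 pixel to RGB565: adding the masked R/G fields to
 * themselves shifts them left by one, while blue stays in bits 0-4. */
static uint16_t rgb15to16_pixel(uint16_t x)
{
    return (uint16_t)((x & 0x7FFF) + (x & 0x7FE0));
}

int main(void)
{
    uint16_t x = 0x1234;
    /* 0x7FFF is white in RGB555 (r=g=b=31); in RGB565 that is 0xFFDF
     * (r=31, g=62, b=31 -- green's new low bit stays zero). */
    assert(rgb15to16_pixel(0x7FFF) == 0xFFDF);
    /* equivalent to the shift-based formulation */
    assert(rgb15to16_pixel(x) ==
           (uint16_t)(((x & 0x7FE0) << 1) | (x & 0x001F)));
    return 0;
}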
diff --git a/src/plugins/ffmpeg/libswscale/rgb2rgb.h b/src/plugins/ffmpeg/libswscale/rgb2rgb.h
deleted file mode 100644
index f2697c6..0000000
--- a/src/plugins/ffmpeg/libswscale/rgb2rgb.h
+++ /dev/null
@@ -1,142 +0,0 @@
/*
 * software RGB to RGB converter
 * plus software PAL8 to RGB converter
 * software YUV to YUV converter
 * software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef FFMPEG_RGB2RGB_H
#define FFMPEG_RGB2RGB_H

#include <inttypes.h>

/* A full collection of RGB to RGB(BGR) converters */
extern void (*rgb24to32)   (const uint8_t *src, uint8_t *dst, long src_size);
extern void (*rgb24to16)   (const uint8_t *src, uint8_t *dst, long src_size);
extern void (*rgb24to15)   (const uint8_t *src, uint8_t *dst, long src_size);
extern void (*rgb32to24)   (const uint8_t *src, uint8_t *dst, long src_size);
extern void (*rgb32to16)   (const uint8_t *src, uint8_t *dst, long src_size);
extern void (*rgb32to15)   (const uint8_t *src, uint8_t *dst, long src_size);
extern void (*rgb15to16)   (const uint8_t *src, uint8_t *dst, long src_size);
extern void (*rgb15to24)   (const uint8_t *src, uint8_t *dst, long src_size);
extern void (*rgb15to32)   (const uint8_t *src, uint8_t *dst, long src_size);
extern void (*rgb16to15)   (const uint8_t *src, uint8_t *dst, long src_size);
extern void (*rgb16to24)   (const uint8_t *src, uint8_t *dst, long src_size);
extern void (*rgb16to32)   (const uint8_t *src, uint8_t *dst, long src_size);
extern void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size);
extern void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size);
extern void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size);
extern void (*rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size);
extern void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size);
extern void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size);

extern void rgb24tobgr32(const uint8_t *src, uint8_t *dst, long src_size);
extern void rgb32tobgr24(const uint8_t *src, uint8_t *dst, long src_size);
extern void rgb16tobgr32(const uint8_t *src, uint8_t *dst, long src_size);
extern void rgb16tobgr24(const uint8_t *src, uint8_t *dst, long src_size);
extern void rgb16tobgr16(const uint8_t *src, uint8_t *dst, long src_size);
extern void rgb16tobgr15(const uint8_t *src, uint8_t *dst, long src_size);
extern void rgb15tobgr32(const uint8_t *src, uint8_t *dst, long src_size);
extern void rgb15tobgr24(const uint8_t *src, uint8_t *dst, long src_size);
extern void rgb15tobgr16(const uint8_t *src, uint8_t *dst, long src_size);
extern void rgb15tobgr15(const uint8_t *src, uint8_t *dst, long src_size);
extern void rgb8tobgr8  (const uint8_t *src, uint8_t *dst, long src_size);


extern void palette8torgb32(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
extern void palette8tobgr32(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
extern void palette8torgb24(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
extern void palette8tobgr24(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
extern void palette8torgb16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
extern void palette8tobgr16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
extern void palette8torgb15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
extern void palette8tobgr15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line, others are ignored.
 * FIXME: Write HQ version.
 */
//void uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
extern void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                          long width, long height,
                          long lumStride, long chromStride, long dstStride);

/**
 * Width should be a multiple of 16.
 */
extern void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                             long width, long height,
                             long lumStride, long chromStride, long dstStride);

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
extern void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                          long width, long height,
                          long lumStride, long chromStride, long srcStride);

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
extern void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                          long width, long height,
                          long lumStride, long chromStride, long dstStride);

/**
 * Height should be a multiple of 2 and width should be a multiple of 2.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line, others are ignored.
 * FIXME: Write HQ version.
 */
extern void (*rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                           long width, long height,
                           long lumStride, long chromStride, long srcStride);
extern void (*planar2x)(const uint8_t *src, uint8_t *dst, long width, long height,
                        long srcStride, long dstStride);

extern void (*interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dst,
                               long width, long height, long src1Stride,
                               long src2Stride, long dstStride);

extern void (*vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                           uint8_t *dst1, uint8_t *dst2,
                           long width, long height,
                           long srcStride1, long srcStride2,
                           long dstStride1, long dstStride2);

extern void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                            uint8_t *dst,
                            long width, long height,
                            long srcStride1, long srcStride2,
                            long srcStride3, long dstStride);

void sws_rgb2rgb_init(int flags);

#endif /* FFMPEG_RGB2RGB_H */
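Because every converter in this header is exported as a function pointer, callers must run sws_rgb2rgb_init() once before the first conversion so that the C, MMX, MMX2, or 3DNOW implementations get bound (per the dispatcher in rgb2rgb.c above, a flags value of 0 selects the plain C versions). A usage sketch follows; convert_frame is an illustrative caller, not part of the API.

#include "rgb2rgb.h"
#include "swscale.h"

int convert_frame(const uint8_t *rgb24, uint8_t *rgb32, int w, int h)
{
    /* Bind implementations once; SWS_CPU_CAPS_* flags force SIMD variants. */
    static int initialized;
    if (!initialized) {
        sws_rgb2rgb_init(0);
        initialized = 1;
    }
    /* The size argument is the *source* size in bytes: 3 bytes per pixel
     * here, so the destination must hold w*h*4 bytes. */
    rgb24to32(rgb24, rgb32, (long)w * h * 3);
    return 0;
}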
diff --git a/src/plugins/ffmpeg/libswscale/rgb2rgb_template.c b/src/plugins/ffmpeg/libswscale/rgb2rgb_template.c
deleted file mode 100644
index ffbf2c7..0000000
--- a/src/plugins/ffmpeg/libswscale/rgb2rgb_template.c
+++ /dev/null
@@ -1,2738 +0,0 @@
/*
 * software RGB to RGB converter
 * plus software PAL8 to RGB converter
 * software YUV to YUV converter
 * software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 * lots of big-endian byte-order fixes by Alex Beregszaszi
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */
29
30#include <stddef.h>
31#include <inttypes.h> /* for __WORDSIZE */
32
33#ifndef __WORDSIZE
34// #warning You have a misconfigured system and will probably lose performance!
35#define __WORDSIZE MP_WORDSIZE
36#endif
37
38#undef PREFETCH
39#undef MOVNTQ
40#undef EMMS
41#undef SFENCE
42#undef MMREG_SIZE
43#undef PREFETCHW
44#undef PAVGB
45
46#ifdef HAVE_SSE2
47#define MMREG_SIZE 16
48#else
49#define MMREG_SIZE 8
50#endif
51
52#ifdef HAVE_3DNOW
53#define PREFETCH "prefetch"
54#define PREFETCHW "prefetchw"
55#define PAVGB "pavgusb"
56#elif defined (HAVE_MMX2)
57#define PREFETCH "prefetchnta"
58#define PREFETCHW "prefetcht0"
59#define PAVGB "pavgb"
60#else
61#ifdef __APPLE__
62#define PREFETCH "#"
63#define PREFETCHW "#"
64#else
65#define PREFETCH " # nop"
66#define PREFETCHW " # nop"
67#endif
68#endif
69
70#ifdef HAVE_3DNOW
71/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
72#define EMMS "femms"
73#else
74#define EMMS "emms"
75#endif
76
77#ifdef HAVE_MMX2
78#define MOVNTQ "movntq"
79#define SFENCE "sfence"
80#else
81#define MOVNTQ "movq"
82#define SFENCE " # nop"
83#endif
84
85static inline void RENAME(rgb24to32)(const uint8_t *src, uint8_t *dst, long src_size)
86{
87 uint8_t *dest = dst;
88 const uint8_t *s = src;
89 const uint8_t *end;
90 #ifdef HAVE_MMX
91 const uint8_t *mm_end;
92 #endif
93 end = s + src_size;
94 #ifdef HAVE_MMX
95 asm volatile(PREFETCH" %0"::"m"(*s):"memory");
96 mm_end = end - 23;
97 asm volatile("movq %0, %%mm7"::"m"(mask32):"memory");
98 while (s < mm_end)
99 {
100 asm volatile(
101 PREFETCH" 32%1 \n\t"
102 "movd %1, %%mm0 \n\t"
103 "punpckldq 3%1, %%mm0 \n\t"
104 "movd 6%1, %%mm1 \n\t"
105 "punpckldq 9%1, %%mm1 \n\t"
106 "movd 12%1, %%mm2 \n\t"
107 "punpckldq 15%1, %%mm2 \n\t"
108 "movd 18%1, %%mm3 \n\t"
109 "punpckldq 21%1, %%mm3 \n\t"
110 "pand %%mm7, %%mm0 \n\t"
111 "pand %%mm7, %%mm1 \n\t"
112 "pand %%mm7, %%mm2 \n\t"
113 "pand %%mm7, %%mm3 \n\t"
114 MOVNTQ" %%mm0, %0 \n\t"
115 MOVNTQ" %%mm1, 8%0 \n\t"
116 MOVNTQ" %%mm2, 16%0 \n\t"
117 MOVNTQ" %%mm3, 24%0"
118 :"=m"(*dest)
119 :"m"(*s)
120 :"memory");
121 dest += 32;
122 s += 24;
123 }
124 asm volatile(SFENCE:::"memory");
125 asm volatile(EMMS:::"memory");
126 #endif
127 while (s < end)
128 {
129 #ifdef WORDS_BIGENDIAN
130 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
131 *dest++ = 0;
132 *dest++ = s[2];
133 *dest++ = s[1];
134 *dest++ = s[0];
135 s+=3;
136 #else
137 *dest++ = *s++;
138 *dest++ = *s++;
139 *dest++ = *s++;
140 *dest++ = 0;
141 #endif
142 }
143}
144
145static inline void RENAME(rgb32to24)(const uint8_t *src, uint8_t *dst, long src_size)
146{
147 uint8_t *dest = dst;
148 const uint8_t *s = src;
149 const uint8_t *end;
150#ifdef HAVE_MMX
151 const uint8_t *mm_end;
152#endif
153 end = s + src_size;
154#ifdef HAVE_MMX
155 asm volatile(PREFETCH" %0"::"m"(*s):"memory");
156 mm_end = end - 31;
157 while (s < mm_end)
158 {
159 asm volatile(
160 PREFETCH" 32%1 \n\t"
161 "movq %1, %%mm0 \n\t"
162 "movq 8%1, %%mm1 \n\t"
163 "movq 16%1, %%mm4 \n\t"
164 "movq 24%1, %%mm5 \n\t"
165 "movq %%mm0, %%mm2 \n\t"
166 "movq %%mm1, %%mm3 \n\t"
167 "movq %%mm4, %%mm6 \n\t"
168 "movq %%mm5, %%mm7 \n\t"
169 "psrlq $8, %%mm2 \n\t"
170 "psrlq $8, %%mm3 \n\t"
171 "psrlq $8, %%mm6 \n\t"
172 "psrlq $8, %%mm7 \n\t"
173 "pand %2, %%mm0 \n\t"
174 "pand %2, %%mm1 \n\t"
175 "pand %2, %%mm4 \n\t"
176 "pand %2, %%mm5 \n\t"
177 "pand %3, %%mm2 \n\t"
178 "pand %3, %%mm3 \n\t"
179 "pand %3, %%mm6 \n\t"
180 "pand %3, %%mm7 \n\t"
181 "por %%mm2, %%mm0 \n\t"
182 "por %%mm3, %%mm1 \n\t"
183 "por %%mm6, %%mm4 \n\t"
184 "por %%mm7, %%mm5 \n\t"
185
186 "movq %%mm1, %%mm2 \n\t"
187 "movq %%mm4, %%mm3 \n\t"
188 "psllq $48, %%mm2 \n\t"
189 "psllq $32, %%mm3 \n\t"
190 "pand %4, %%mm2 \n\t"
191 "pand %5, %%mm3 \n\t"
192 "por %%mm2, %%mm0 \n\t"
193 "psrlq $16, %%mm1 \n\t"
194 "psrlq $32, %%mm4 \n\t"
195 "psllq $16, %%mm5 \n\t"
196 "por %%mm3, %%mm1 \n\t"
197 "pand %6, %%mm5 \n\t"
198 "por %%mm5, %%mm4 \n\t"
199
200 MOVNTQ" %%mm0, %0 \n\t"
201 MOVNTQ" %%mm1, 8%0 \n\t"
202 MOVNTQ" %%mm4, 16%0"
203 :"=m"(*dest)
204 :"m"(*s),"m"(mask24l),
205 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
206 :"memory");
207 dest += 24;
208 s += 32;
209 }
210 asm volatile(SFENCE:::"memory");
211 asm volatile(EMMS:::"memory");
212#endif
213 while (s < end)
214 {
215#ifdef WORDS_BIGENDIAN
216 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
217 s++;
218 dest[2] = *s++;
219 dest[1] = *s++;
220 dest[0] = *s++;
221 dest += 3;
222#else
223 *dest++ = *s++;
224 *dest++ = *s++;
225 *dest++ = *s++;
226 s++;
227#endif
228 }
229}
230
231/*
232 original by Strepto/Astral
233 ported to gcc & bugfixed: A'rpi
234 MMX2, 3DNOW optimization by Nick Kurshev
235 32-bit C version, and and&add trick by Michael Niedermayer
236*/
237static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
238{
239 register const uint8_t* s=src;
240 register uint8_t* d=dst;
241 register const uint8_t *end;
242 const uint8_t *mm_end;
243 end = s + src_size;
244#ifdef HAVE_MMX
245 asm volatile(PREFETCH" %0"::"m"(*s));
246 asm volatile("movq %0, %%mm4"::"m"(mask15s));
247 mm_end = end - 15;
248 while (s<mm_end)
249 {
250 asm volatile(
251 PREFETCH" 32%1 \n\t"
252 "movq %1, %%mm0 \n\t"
253 "movq 8%1, %%mm2 \n\t"
254 "movq %%mm0, %%mm1 \n\t"
255 "movq %%mm2, %%mm3 \n\t"
256 "pand %%mm4, %%mm0 \n\t"
257 "pand %%mm4, %%mm2 \n\t"
258 "paddw %%mm1, %%mm0 \n\t"
259 "paddw %%mm3, %%mm2 \n\t"
260 MOVNTQ" %%mm0, %0 \n\t"
261 MOVNTQ" %%mm2, 8%0"
262 :"=m"(*d)
263 :"m"(*s)
264 );
265 d+=16;
266 s+=16;
267 }
268 asm volatile(SFENCE:::"memory");
269 asm volatile(EMMS:::"memory");
270#endif
271 mm_end = end - 3;
272 while (s < mm_end)
273 {
274 register unsigned x= *((const uint32_t *)s);
275 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
276 d+=4;
277 s+=4;
278 }
279 if (s < end)
280 {
281 register unsigned short x= *((const uint16_t *)s);
282 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
283 }
284}
285
286static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
287{
288 register const uint8_t* s=src;
289 register uint8_t* d=dst;
290 register const uint8_t *end;
291 const uint8_t *mm_end;
292 end = s + src_size;
293#ifdef HAVE_MMX
294 asm volatile(PREFETCH" %0"::"m"(*s));
295 asm volatile("movq %0, %%mm7"::"m"(mask15rg));
296 asm volatile("movq %0, %%mm6"::"m"(mask15b));
297 mm_end = end - 15;
298 while (s<mm_end)
299 {
300 asm volatile(
301 PREFETCH" 32%1 \n\t"
302 "movq %1, %%mm0 \n\t"
303 "movq 8%1, %%mm2 \n\t"
304 "movq %%mm0, %%mm1 \n\t"
305 "movq %%mm2, %%mm3 \n\t"
306 "psrlq $1, %%mm0 \n\t"
307 "psrlq $1, %%mm2 \n\t"
308 "pand %%mm7, %%mm0 \n\t"
309 "pand %%mm7, %%mm2 \n\t"
310 "pand %%mm6, %%mm1 \n\t"
311 "pand %%mm6, %%mm3 \n\t"
312 "por %%mm1, %%mm0 \n\t"
313 "por %%mm3, %%mm2 \n\t"
314 MOVNTQ" %%mm0, %0 \n\t"
315 MOVNTQ" %%mm2, 8%0"
316 :"=m"(*d)
317 :"m"(*s)
318 );
319 d+=16;
320 s+=16;
321 }
322 asm volatile(SFENCE:::"memory");
323 asm volatile(EMMS:::"memory");
324#endif
325 mm_end = end - 3;
326 while (s < mm_end)
327 {
328 register uint32_t x= *((const uint32_t*)s);
329 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
330 s+=4;
331 d+=4;
332 }
333 if (s < end)
334 {
335 register uint16_t x= *((const uint16_t*)s);
336 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
337 s+=2;
338 d+=2;
339 }
340}
341
342static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
343{
344 const uint8_t *s = src;
345 const uint8_t *end;
346#ifdef HAVE_MMX
347 const uint8_t *mm_end;
348#endif
349 uint16_t *d = (uint16_t *)dst;
350 end = s + src_size;
351#ifdef HAVE_MMX
352 mm_end = end - 15;
353#if 1 //this path is faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this holds; on Athlon it is slightly faster)
354 asm volatile(
355 "movq %3, %%mm5 \n\t"
356 "movq %4, %%mm6 \n\t"
357 "movq %5, %%mm7 \n\t"
358 "jmp 2f \n\t"
359 ASMALIGN(4)
360 "1: \n\t"
361 PREFETCH" 32(%1) \n\t"
362 "movd (%1), %%mm0 \n\t"
363 "movd 4(%1), %%mm3 \n\t"
364 "punpckldq 8(%1), %%mm0 \n\t"
365 "punpckldq 12(%1), %%mm3 \n\t"
366 "movq %%mm0, %%mm1 \n\t"
367 "movq %%mm3, %%mm4 \n\t"
368 "pand %%mm6, %%mm0 \n\t"
369 "pand %%mm6, %%mm3 \n\t"
370 "pmaddwd %%mm7, %%mm0 \n\t"
371 "pmaddwd %%mm7, %%mm3 \n\t"
372 "pand %%mm5, %%mm1 \n\t"
373 "pand %%mm5, %%mm4 \n\t"
374 "por %%mm1, %%mm0 \n\t"
375 "por %%mm4, %%mm3 \n\t"
376 "psrld $5, %%mm0 \n\t"
377 "pslld $11, %%mm3 \n\t"
378 "por %%mm3, %%mm0 \n\t"
379 MOVNTQ" %%mm0, (%0) \n\t"
380 "add $16, %1 \n\t"
381 "add $8, %0 \n\t"
382 "2: \n\t"
383 "cmp %2, %1 \n\t"
384 " jb 1b \n\t"
385 : "+r" (d), "+r"(s)
386 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
387 );
388#else
389 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
390 asm volatile(
391 "movq %0, %%mm7 \n\t"
392 "movq %1, %%mm6 \n\t"
393 ::"m"(red_16mask),"m"(green_16mask));
394 while (s < mm_end)
395 {
396 asm volatile(
397 PREFETCH" 32%1 \n\t"
398 "movd %1, %%mm0 \n\t"
399 "movd 4%1, %%mm3 \n\t"
400 "punpckldq 8%1, %%mm0 \n\t"
401 "punpckldq 12%1, %%mm3 \n\t"
402 "movq %%mm0, %%mm1 \n\t"
403 "movq %%mm0, %%mm2 \n\t"
404 "movq %%mm3, %%mm4 \n\t"
405 "movq %%mm3, %%mm5 \n\t"
406 "psrlq $3, %%mm0 \n\t"
407 "psrlq $3, %%mm3 \n\t"
408 "pand %2, %%mm0 \n\t"
409 "pand %2, %%mm3 \n\t"
410 "psrlq $5, %%mm1 \n\t"
411 "psrlq $5, %%mm4 \n\t"
412 "pand %%mm6, %%mm1 \n\t"
413 "pand %%mm6, %%mm4 \n\t"
414 "psrlq $8, %%mm2 \n\t"
415 "psrlq $8, %%mm5 \n\t"
416 "pand %%mm7, %%mm2 \n\t"
417 "pand %%mm7, %%mm5 \n\t"
418 "por %%mm1, %%mm0 \n\t"
419 "por %%mm4, %%mm3 \n\t"
420 "por %%mm2, %%mm0 \n\t"
421 "por %%mm5, %%mm3 \n\t"
422 "psllq $16, %%mm3 \n\t"
423 "por %%mm3, %%mm0 \n\t"
424 MOVNTQ" %%mm0, %0 \n\t"
425 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
426 d += 4;
427 s += 16;
428 }
429#endif
430 asm volatile(SFENCE:::"memory");
431 asm volatile(EMMS:::"memory");
432#endif
433 while (s < end)
434 {
435 register int rgb = *(const uint32_t*)s; s += 4;
436 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
437 }
438}
439
440static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
441{
442 const uint8_t *s = src;
443 const uint8_t *end;
444#ifdef HAVE_MMX
445 const uint8_t *mm_end;
446#endif
447 uint16_t *d = (uint16_t *)dst;
448 end = s + src_size;
449#ifdef HAVE_MMX
450 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
451 asm volatile(
452 "movq %0, %%mm7 \n\t"
453 "movq %1, %%mm6 \n\t"
454 ::"m"(red_16mask),"m"(green_16mask));
455 mm_end = end - 15;
456 while (s < mm_end)
457 {
458 asm volatile(
459 PREFETCH" 32%1 \n\t"
460 "movd %1, %%mm0 \n\t"
461 "movd 4%1, %%mm3 \n\t"
462 "punpckldq 8%1, %%mm0 \n\t"
463 "punpckldq 12%1, %%mm3 \n\t"
464 "movq %%mm0, %%mm1 \n\t"
465 "movq %%mm0, %%mm2 \n\t"
466 "movq %%mm3, %%mm4 \n\t"
467 "movq %%mm3, %%mm5 \n\t"
468 "psllq $8, %%mm0 \n\t"
469 "psllq $8, %%mm3 \n\t"
470 "pand %%mm7, %%mm0 \n\t"
471 "pand %%mm7, %%mm3 \n\t"
472 "psrlq $5, %%mm1 \n\t"
473 "psrlq $5, %%mm4 \n\t"
474 "pand %%mm6, %%mm1 \n\t"
475 "pand %%mm6, %%mm4 \n\t"
476 "psrlq $19, %%mm2 \n\t"
477 "psrlq $19, %%mm5 \n\t"
478 "pand %2, %%mm2 \n\t"
479 "pand %2, %%mm5 \n\t"
480 "por %%mm1, %%mm0 \n\t"
481 "por %%mm4, %%mm3 \n\t"
482 "por %%mm2, %%mm0 \n\t"
483 "por %%mm5, %%mm3 \n\t"
484 "psllq $16, %%mm3 \n\t"
485 "por %%mm3, %%mm0 \n\t"
486 MOVNTQ" %%mm0, %0 \n\t"
487 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
488 d += 4;
489 s += 16;
490 }
491 asm volatile(SFENCE:::"memory");
492 asm volatile(EMMS:::"memory");
493#endif
494 while (s < end)
495 {
496 register int rgb = *(const uint32_t*)s; s += 4;
497 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
498 }
499}
500
501static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
502{
503 const uint8_t *s = src;
504 const uint8_t *end;
505#ifdef HAVE_MMX
506 const uint8_t *mm_end;
507#endif
508 uint16_t *d = (uint16_t *)dst;
509 end = s + src_size;
510#ifdef HAVE_MMX
511 mm_end = end - 15;
512#if 1 //this path is faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this holds; on Athlon it is slightly faster)
513 asm volatile(
514 "movq %3, %%mm5 \n\t"
515 "movq %4, %%mm6 \n\t"
516 "movq %5, %%mm7 \n\t"
517 "jmp 2f \n\t"
518 ASMALIGN(4)
519 "1: \n\t"
520 PREFETCH" 32(%1) \n\t"
521 "movd (%1), %%mm0 \n\t"
522 "movd 4(%1), %%mm3 \n\t"
523 "punpckldq 8(%1), %%mm0 \n\t"
524 "punpckldq 12(%1), %%mm3 \n\t"
525 "movq %%mm0, %%mm1 \n\t"
526 "movq %%mm3, %%mm4 \n\t"
527 "pand %%mm6, %%mm0 \n\t"
528 "pand %%mm6, %%mm3 \n\t"
529 "pmaddwd %%mm7, %%mm0 \n\t"
530 "pmaddwd %%mm7, %%mm3 \n\t"
531 "pand %%mm5, %%mm1 \n\t"
532 "pand %%mm5, %%mm4 \n\t"
533 "por %%mm1, %%mm0 \n\t"
534 "por %%mm4, %%mm3 \n\t"
535 "psrld $6, %%mm0 \n\t"
536 "pslld $10, %%mm3 \n\t"
537 "por %%mm3, %%mm0 \n\t"
538 MOVNTQ" %%mm0, (%0) \n\t"
539 "add $16, %1 \n\t"
540 "add $8, %0 \n\t"
541 "2: \n\t"
542 "cmp %2, %1 \n\t"
543 " jb 1b \n\t"
544 : "+r" (d), "+r"(s)
545 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
546 );
547#else
548 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
549 asm volatile(
550 "movq %0, %%mm7 \n\t"
551 "movq %1, %%mm6 \n\t"
552 ::"m"(red_15mask),"m"(green_15mask));
553 while (s < mm_end)
554 {
555 asm volatile(
556 PREFETCH" 32%1 \n\t"
557 "movd %1, %%mm0 \n\t"
558 "movd 4%1, %%mm3 \n\t"
559 "punpckldq 8%1, %%mm0 \n\t"
560 "punpckldq 12%1, %%mm3 \n\t"
561 "movq %%mm0, %%mm1 \n\t"
562 "movq %%mm0, %%mm2 \n\t"
563 "movq %%mm3, %%mm4 \n\t"
564 "movq %%mm3, %%mm5 \n\t"
565 "psrlq $3, %%mm0 \n\t"
566 "psrlq $3, %%mm3 \n\t"
567 "pand %2, %%mm0 \n\t"
568 "pand %2, %%mm3 \n\t"
569 "psrlq $6, %%mm1 \n\t"
570 "psrlq $6, %%mm4 \n\t"
571 "pand %%mm6, %%mm1 \n\t"
572 "pand %%mm6, %%mm4 \n\t"
573 "psrlq $9, %%mm2 \n\t"
574 "psrlq $9, %%mm5 \n\t"
575 "pand %%mm7, %%mm2 \n\t"
576 "pand %%mm7, %%mm5 \n\t"
577 "por %%mm1, %%mm0 \n\t"
578 "por %%mm4, %%mm3 \n\t"
579 "por %%mm2, %%mm0 \n\t"
580 "por %%mm5, %%mm3 \n\t"
581 "psllq $16, %%mm3 \n\t"
582 "por %%mm3, %%mm0 \n\t"
583 MOVNTQ" %%mm0, %0 \n\t"
584 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
585 d += 4;
586 s += 16;
587 }
588#endif
589 asm volatile(SFENCE:::"memory");
590 asm volatile(EMMS:::"memory");
591#endif
592 while (s < end)
593 {
594 register int rgb = *(const uint32_t*)s; s += 4;
595 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
596 }
597}
598
599static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
600{
601 const uint8_t *s = src;
602 const uint8_t *end;
603#ifdef HAVE_MMX
604 const uint8_t *mm_end;
605#endif
606 uint16_t *d = (uint16_t *)dst;
607 end = s + src_size;
608#ifdef HAVE_MMX
609 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
610 asm volatile(
611 "movq %0, %%mm7 \n\t"
612 "movq %1, %%mm6 \n\t"
613 ::"m"(red_15mask),"m"(green_15mask));
614 mm_end = end - 15;
615 while (s < mm_end)
616 {
617 asm volatile(
618 PREFETCH" 32%1 \n\t"
619 "movd %1, %%mm0 \n\t"
620 "movd 4%1, %%mm3 \n\t"
621 "punpckldq 8%1, %%mm0 \n\t"
622 "punpckldq 12%1, %%mm3 \n\t"
623 "movq %%mm0, %%mm1 \n\t"
624 "movq %%mm0, %%mm2 \n\t"
625 "movq %%mm3, %%mm4 \n\t"
626 "movq %%mm3, %%mm5 \n\t"
627 "psllq $7, %%mm0 \n\t"
628 "psllq $7, %%mm3 \n\t"
629 "pand %%mm7, %%mm0 \n\t"
630 "pand %%mm7, %%mm3 \n\t"
631 "psrlq $6, %%mm1 \n\t"
632 "psrlq $6, %%mm4 \n\t"
633 "pand %%mm6, %%mm1 \n\t"
634 "pand %%mm6, %%mm4 \n\t"
635 "psrlq $19, %%mm2 \n\t"
636 "psrlq $19, %%mm5 \n\t"
637 "pand %2, %%mm2 \n\t"
638 "pand %2, %%mm5 \n\t"
639 "por %%mm1, %%mm0 \n\t"
640 "por %%mm4, %%mm3 \n\t"
641 "por %%mm2, %%mm0 \n\t"
642 "por %%mm5, %%mm3 \n\t"
643 "psllq $16, %%mm3 \n\t"
644 "por %%mm3, %%mm0 \n\t"
645 MOVNTQ" %%mm0, %0 \n\t"
646 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
647 d += 4;
648 s += 16;
649 }
650 asm volatile(SFENCE:::"memory");
651 asm volatile(EMMS:::"memory");
652#endif
653 while (s < end)
654 {
655 register int rgb = *(const uint32_t*)s; s += 4;
656 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
657 }
658}
659
660static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
661{
662 const uint8_t *s = src;
663 const uint8_t *end;
664#ifdef HAVE_MMX
665 const uint8_t *mm_end;
666#endif
667 uint16_t *d = (uint16_t *)dst;
668 end = s + src_size;
669#ifdef HAVE_MMX
670 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
671 asm volatile(
672 "movq %0, %%mm7 \n\t"
673 "movq %1, %%mm6 \n\t"
674 ::"m"(red_16mask),"m"(green_16mask));
675 mm_end = end - 11;
676 while (s < mm_end)
677 {
678 asm volatile(
679 PREFETCH" 32%1 \n\t"
680 "movd %1, %%mm0 \n\t"
681 "movd 3%1, %%mm3 \n\t"
682 "punpckldq 6%1, %%mm0 \n\t"
683 "punpckldq 9%1, %%mm3 \n\t"
684 "movq %%mm0, %%mm1 \n\t"
685 "movq %%mm0, %%mm2 \n\t"
686 "movq %%mm3, %%mm4 \n\t"
687 "movq %%mm3, %%mm5 \n\t"
688 "psrlq $3, %%mm0 \n\t"
689 "psrlq $3, %%mm3 \n\t"
690 "pand %2, %%mm0 \n\t"
691 "pand %2, %%mm3 \n\t"
692 "psrlq $5, %%mm1 \n\t"
693 "psrlq $5, %%mm4 \n\t"
694 "pand %%mm6, %%mm1 \n\t"
695 "pand %%mm6, %%mm4 \n\t"
696 "psrlq $8, %%mm2 \n\t"
697 "psrlq $8, %%mm5 \n\t"
698 "pand %%mm7, %%mm2 \n\t"
699 "pand %%mm7, %%mm5 \n\t"
700 "por %%mm1, %%mm0 \n\t"
701 "por %%mm4, %%mm3 \n\t"
702 "por %%mm2, %%mm0 \n\t"
703 "por %%mm5, %%mm3 \n\t"
704 "psllq $16, %%mm3 \n\t"
705 "por %%mm3, %%mm0 \n\t"
706 MOVNTQ" %%mm0, %0 \n\t"
707 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
708 d += 4;
709 s += 12;
710 }
711 asm volatile(SFENCE:::"memory");
712 asm volatile(EMMS:::"memory");
713#endif
714 while (s < end)
715 {
716 const int b = *s++;
717 const int g = *s++;
718 const int r = *s++;
719 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
720 }
721}
722
723static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
724{
725 const uint8_t *s = src;
726 const uint8_t *end;
727#ifdef HAVE_MMX
728 const uint8_t *mm_end;
729#endif
730 uint16_t *d = (uint16_t *)dst;
731 end = s + src_size;
732#ifdef HAVE_MMX
733 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
734 asm volatile(
735 "movq %0, %%mm7 \n\t"
736 "movq %1, %%mm6 \n\t"
737 ::"m"(red_16mask),"m"(green_16mask));
738 mm_end = end - 15;
739 while (s < mm_end)
740 {
741 asm volatile(
742 PREFETCH" 32%1 \n\t"
743 "movd %1, %%mm0 \n\t"
744 "movd 3%1, %%mm3 \n\t"
745 "punpckldq 6%1, %%mm0 \n\t"
746 "punpckldq 9%1, %%mm3 \n\t"
747 "movq %%mm0, %%mm1 \n\t"
748 "movq %%mm0, %%mm2 \n\t"
749 "movq %%mm3, %%mm4 \n\t"
750 "movq %%mm3, %%mm5 \n\t"
751 "psllq $8, %%mm0 \n\t"
752 "psllq $8, %%mm3 \n\t"
753 "pand %%mm7, %%mm0 \n\t"
754 "pand %%mm7, %%mm3 \n\t"
755 "psrlq $5, %%mm1 \n\t"
756 "psrlq $5, %%mm4 \n\t"
757 "pand %%mm6, %%mm1 \n\t"
758 "pand %%mm6, %%mm4 \n\t"
759 "psrlq $19, %%mm2 \n\t"
760 "psrlq $19, %%mm5 \n\t"
761 "pand %2, %%mm2 \n\t"
762 "pand %2, %%mm5 \n\t"
763 "por %%mm1, %%mm0 \n\t"
764 "por %%mm4, %%mm3 \n\t"
765 "por %%mm2, %%mm0 \n\t"
766 "por %%mm5, %%mm3 \n\t"
767 "psllq $16, %%mm3 \n\t"
768 "por %%mm3, %%mm0 \n\t"
769 MOVNTQ" %%mm0, %0 \n\t"
770 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
771 d += 4;
772 s += 12;
773 }
774 asm volatile(SFENCE:::"memory");
775 asm volatile(EMMS:::"memory");
776#endif
777 while (s < end)
778 {
779 const int r = *s++;
780 const int g = *s++;
781 const int b = *s++;
782 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
783 }
784}
785
786static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
787{
788 const uint8_t *s = src;
789 const uint8_t *end;
790#ifdef HAVE_MMX
791 const uint8_t *mm_end;
792#endif
793 uint16_t *d = (uint16_t *)dst;
794 end = s + src_size;
795#ifdef HAVE_MMX
796 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
797 asm volatile(
798 "movq %0, %%mm7 \n\t"
799 "movq %1, %%mm6 \n\t"
800 ::"m"(red_15mask),"m"(green_15mask));
801 mm_end = end - 11;
802 while (s < mm_end)
803 {
804 asm volatile(
805 PREFETCH" 32%1 \n\t"
806 "movd %1, %%mm0 \n\t"
807 "movd 3%1, %%mm3 \n\t"
808 "punpckldq 6%1, %%mm0 \n\t"
809 "punpckldq 9%1, %%mm3 \n\t"
810 "movq %%mm0, %%mm1 \n\t"
811 "movq %%mm0, %%mm2 \n\t"
812 "movq %%mm3, %%mm4 \n\t"
813 "movq %%mm3, %%mm5 \n\t"
814 "psrlq $3, %%mm0 \n\t"
815 "psrlq $3, %%mm3 \n\t"
816 "pand %2, %%mm0 \n\t"
817 "pand %2, %%mm3 \n\t"
818 "psrlq $6, %%mm1 \n\t"
819 "psrlq $6, %%mm4 \n\t"
820 "pand %%mm6, %%mm1 \n\t"
821 "pand %%mm6, %%mm4 \n\t"
822 "psrlq $9, %%mm2 \n\t"
823 "psrlq $9, %%mm5 \n\t"
824 "pand %%mm7, %%mm2 \n\t"
825 "pand %%mm7, %%mm5 \n\t"
826 "por %%mm1, %%mm0 \n\t"
827 "por %%mm4, %%mm3 \n\t"
828 "por %%mm2, %%mm0 \n\t"
829 "por %%mm5, %%mm3 \n\t"
830 "psllq $16, %%mm3 \n\t"
831 "por %%mm3, %%mm0 \n\t"
832 MOVNTQ" %%mm0, %0 \n\t"
833 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
834 d += 4;
835 s += 12;
836 }
837 asm volatile(SFENCE:::"memory");
838 asm volatile(EMMS:::"memory");
839#endif
840 while (s < end)
841 {
842 const int b = *s++;
843 const int g = *s++;
844 const int r = *s++;
845 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
846 }
847}
848
849static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
850{
851 const uint8_t *s = src;
852 const uint8_t *end;
853#ifdef HAVE_MMX
854 const uint8_t *mm_end;
855#endif
856 uint16_t *d = (uint16_t *)dst;
857 end = s + src_size;
858#ifdef HAVE_MMX
859 asm volatile(PREFETCH" %0"::"m"(*src):"memory");
860 asm volatile(
861 "movq %0, %%mm7 \n\t"
862 "movq %1, %%mm6 \n\t"
863 ::"m"(red_15mask),"m"(green_15mask));
864 mm_end = end - 15;
865 while (s < mm_end)
866 {
867 asm volatile(
868 PREFETCH" 32%1 \n\t"
869 "movd %1, %%mm0 \n\t"
870 "movd 3%1, %%mm3 \n\t"
871 "punpckldq 6%1, %%mm0 \n\t"
872 "punpckldq 9%1, %%mm3 \n\t"
873 "movq %%mm0, %%mm1 \n\t"
874 "movq %%mm0, %%mm2 \n\t"
875 "movq %%mm3, %%mm4 \n\t"
876 "movq %%mm3, %%mm5 \n\t"
877 "psllq $7, %%mm0 \n\t"
878 "psllq $7, %%mm3 \n\t"
879 "pand %%mm7, %%mm0 \n\t"
880 "pand %%mm7, %%mm3 \n\t"
881 "psrlq $6, %%mm1 \n\t"
882 "psrlq $6, %%mm4 \n\t"
883 "pand %%mm6, %%mm1 \n\t"
884 "pand %%mm6, %%mm4 \n\t"
885 "psrlq $19, %%mm2 \n\t"
886 "psrlq $19, %%mm5 \n\t"
887 "pand %2, %%mm2 \n\t"
888 "pand %2, %%mm5 \n\t"
889 "por %%mm1, %%mm0 \n\t"
890 "por %%mm4, %%mm3 \n\t"
891 "por %%mm2, %%mm0 \n\t"
892 "por %%mm5, %%mm3 \n\t"
893 "psllq $16, %%mm3 \n\t"
894 "por %%mm3, %%mm0 \n\t"
895 MOVNTQ" %%mm0, %0 \n\t"
896 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
897 d += 4;
898 s += 12;
899 }
900 asm volatile(SFENCE:::"memory");
901 asm volatile(EMMS:::"memory");
902#endif
903 while (s < end)
904 {
905 const int r = *s++;
906 const int g = *s++;
907 const int b = *s++;
908 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
909 }
910}
911
912/*
913 I use a less accurate approximation here by simply left-shifting the input
914 value and filling the low-order bits with zeroes. This method improves PNG
915 compression, but it cannot reproduce white exactly, since it never
916 generates an all-ones maximum value; the net effect is to darken the
917 image slightly.
918
919 A better method would be "left bit replication":
920
921 4 3 2 1 0
922 ---------
923 1 1 0 1 1
924
925 7 6 5 4 3 2 1 0
926 ----------------
927 1 1 0 1 1 1 1 0
928 |=======| |===|
929 | leftmost bits repeated to fill open bits
930 |
931 original bits
932*/
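/* A hedged sketch of the two expansion schemes described above; these helpers
 * are illustrative and not part of the original file. expand5_zerofill() is
 * what the conversions below do; expand5_replicate() is the "left bit
 * replication" alternative, which maps 0x1F to 0xFF and so preserves exact
 * white. */
static inline uint8_t expand5_zerofill(uint8_t c5)
{
    return c5 << 3;                /* 0x1F -> 0xF8: white comes out dark   */
}
static inline uint8_t expand5_replicate(uint8_t c5)
{
    return (c5 << 3) | (c5 >> 2);  /* 0x1F -> 0xFF: top bits refill low bits */
}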
933static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
934{
935 const uint16_t *end;
936#ifdef HAVE_MMX
937 const uint16_t *mm_end;
938#endif
939 uint8_t *d = dst;
940 const uint16_t *s = (const uint16_t*)src;
941 end = s + src_size/2;
942#ifdef HAVE_MMX
943 asm volatile(PREFETCH" %0"::"m"(*s):"memory");
944 mm_end = end - 7;
945 while (s < mm_end)
946 {
947 asm volatile(
948 PREFETCH" 32%1 \n\t"
949 "movq %1, %%mm0 \n\t"
950 "movq %1, %%mm1 \n\t"
951 "movq %1, %%mm2 \n\t"
952 "pand %2, %%mm0 \n\t"
953 "pand %3, %%mm1 \n\t"
954 "pand %4, %%mm2 \n\t"
955 "psllq $3, %%mm0 \n\t"
956 "psrlq $2, %%mm1 \n\t"
957 "psrlq $7, %%mm2 \n\t"
958 "movq %%mm0, %%mm3 \n\t"
959 "movq %%mm1, %%mm4 \n\t"
960 "movq %%mm2, %%mm5 \n\t"
961 "punpcklwd %5, %%mm0 \n\t"
962 "punpcklwd %5, %%mm1 \n\t"
963 "punpcklwd %5, %%mm2 \n\t"
964 "punpckhwd %5, %%mm3 \n\t"
965 "punpckhwd %5, %%mm4 \n\t"
966 "punpckhwd %5, %%mm5 \n\t"
967 "psllq $8, %%mm1 \n\t"
968 "psllq $16, %%mm2 \n\t"
969 "por %%mm1, %%mm0 \n\t"
970 "por %%mm2, %%mm0 \n\t"
971 "psllq $8, %%mm4 \n\t"
972 "psllq $16, %%mm5 \n\t"
973 "por %%mm4, %%mm3 \n\t"
974 "por %%mm5, %%mm3 \n\t"
975
976 "movq %%mm0, %%mm6 \n\t"
977 "movq %%mm3, %%mm7 \n\t"
978
979 "movq 8%1, %%mm0 \n\t"
980 "movq 8%1, %%mm1 \n\t"
981 "movq 8%1, %%mm2 \n\t"
982 "pand %2, %%mm0 \n\t"
983 "pand %3, %%mm1 \n\t"
984 "pand %4, %%mm2 \n\t"
985 "psllq $3, %%mm0 \n\t"
986 "psrlq $2, %%mm1 \n\t"
987 "psrlq $7, %%mm2 \n\t"
988 "movq %%mm0, %%mm3 \n\t"
989 "movq %%mm1, %%mm4 \n\t"
990 "movq %%mm2, %%mm5 \n\t"
991 "punpcklwd %5, %%mm0 \n\t"
992 "punpcklwd %5, %%mm1 \n\t"
993 "punpcklwd %5, %%mm2 \n\t"
994 "punpckhwd %5, %%mm3 \n\t"
995 "punpckhwd %5, %%mm4 \n\t"
996 "punpckhwd %5, %%mm5 \n\t"
997 "psllq $8, %%mm1 \n\t"
998 "psllq $16, %%mm2 \n\t"
999 "por %%mm1, %%mm0 \n\t"
1000 "por %%mm2, %%mm0 \n\t"
1001 "psllq $8, %%mm4 \n\t"
1002 "psllq $16, %%mm5 \n\t"
1003 "por %%mm4, %%mm3 \n\t"
1004 "por %%mm5, %%mm3 \n\t"
1005
1006 :"=m"(*d)
1007 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
1008 :"memory");
1009 /* borrowed from the 32-to-24 packing above */
1010 asm volatile(
1011 "movq %%mm0, %%mm4 \n\t"
1012 "movq %%mm3, %%mm5 \n\t"
1013 "movq %%mm6, %%mm0 \n\t"
1014 "movq %%mm7, %%mm1 \n\t"
1015
1016 "movq %%mm4, %%mm6 \n\t"
1017 "movq %%mm5, %%mm7 \n\t"
1018 "movq %%mm0, %%mm2 \n\t"
1019 "movq %%mm1, %%mm3 \n\t"
1020
1021 "psrlq $8, %%mm2 \n\t"
1022 "psrlq $8, %%mm3 \n\t"
1023 "psrlq $8, %%mm6 \n\t"
1024 "psrlq $8, %%mm7 \n\t"
1025 "pand %2, %%mm0 \n\t"
1026 "pand %2, %%mm1 \n\t"
1027 "pand %2, %%mm4 \n\t"
1028 "pand %2, %%mm5 \n\t"
1029 "pand %3, %%mm2 \n\t"
1030 "pand %3, %%mm3 \n\t"
1031 "pand %3, %%mm6 \n\t"
1032 "pand %3, %%mm7 \n\t"
1033 "por %%mm2, %%mm0 \n\t"
1034 "por %%mm3, %%mm1 \n\t"
1035 "por %%mm6, %%mm4 \n\t"
1036 "por %%mm7, %%mm5 \n\t"
1037
1038 "movq %%mm1, %%mm2 \n\t"
1039 "movq %%mm4, %%mm3 \n\t"
1040 "psllq $48, %%mm2 \n\t"
1041 "psllq $32, %%mm3 \n\t"
1042 "pand %4, %%mm2 \n\t"
1043 "pand %5, %%mm3 \n\t"
1044 "por %%mm2, %%mm0 \n\t"
1045 "psrlq $16, %%mm1 \n\t"
1046 "psrlq $32, %%mm4 \n\t"
1047 "psllq $16, %%mm5 \n\t"
1048 "por %%mm3, %%mm1 \n\t"
1049 "pand %6, %%mm5 \n\t"
1050 "por %%mm5, %%mm4 \n\t"
1051
1052 MOVNTQ" %%mm0, %0 \n\t"
1053 MOVNTQ" %%mm1, 8%0 \n\t"
1054 MOVNTQ" %%mm4, 16%0"
1055
1056 :"=m"(*d)
1057 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1058 :"memory");
1059 d += 24;
1060 s += 8;
1061 }
1062 asm volatile(SFENCE:::"memory");
1063 asm volatile(EMMS:::"memory");
1064#endif
1065 while (s < end)
1066 {
1067 register uint16_t bgr;
1068 bgr = *s++;
1069 *d++ = (bgr&0x1F)<<3;
1070 *d++ = (bgr&0x3E0)>>2;
1071 *d++ = (bgr&0x7C00)>>7;
1072 }
1073}
1074
1075static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
1076{
1077 const uint16_t *end;
1078#ifdef HAVE_MMX
1079 const uint16_t *mm_end;
1080#endif
1081 uint8_t *d = (uint8_t *)dst;
1082 const uint16_t *s = (const uint16_t *)src;
1083 end = s + src_size/2;
1084#ifdef HAVE_MMX
1085 asm volatile(PREFETCH" %0"::"m"(*s):"memory");
1086 mm_end = end - 7;
1087 while (s < mm_end)
1088 {
1089 asm volatile(
1090 PREFETCH" 32%1 \n\t"
1091 "movq %1, %%mm0 \n\t"
1092 "movq %1, %%mm1 \n\t"
1093 "movq %1, %%mm2 \n\t"
1094 "pand %2, %%mm0 \n\t"
1095 "pand %3, %%mm1 \n\t"
1096 "pand %4, %%mm2 \n\t"
1097 "psllq $3, %%mm0 \n\t"
1098 "psrlq $3, %%mm1 \n\t"
1099 "psrlq $8, %%mm2 \n\t"
1100 "movq %%mm0, %%mm3 \n\t"
1101 "movq %%mm1, %%mm4 \n\t"
1102 "movq %%mm2, %%mm5 \n\t"
1103 "punpcklwd %5, %%mm0 \n\t"
1104 "punpcklwd %5, %%mm1 \n\t"
1105 "punpcklwd %5, %%mm2 \n\t"
1106 "punpckhwd %5, %%mm3 \n\t"
1107 "punpckhwd %5, %%mm4 \n\t"
1108 "punpckhwd %5, %%mm5 \n\t"
1109 "psllq $8, %%mm1 \n\t"
1110 "psllq $16, %%mm2 \n\t"
1111 "por %%mm1, %%mm0 \n\t"
1112 "por %%mm2, %%mm0 \n\t"
1113 "psllq $8, %%mm4 \n\t"
1114 "psllq $16, %%mm5 \n\t"
1115 "por %%mm4, %%mm3 \n\t"
1116 "por %%mm5, %%mm3 \n\t"
1117
1118 "movq %%mm0, %%mm6 \n\t"
1119 "movq %%mm3, %%mm7 \n\t"
1120
1121 "movq 8%1, %%mm0 \n\t"
1122 "movq 8%1, %%mm1 \n\t"
1123 "movq 8%1, %%mm2 \n\t"
1124 "pand %2, %%mm0 \n\t"
1125 "pand %3, %%mm1 \n\t"
1126 "pand %4, %%mm2 \n\t"
1127 "psllq $3, %%mm0 \n\t"
1128 "psrlq $3, %%mm1 \n\t"
1129 "psrlq $8, %%mm2 \n\t"
1130 "movq %%mm0, %%mm3 \n\t"
1131 "movq %%mm1, %%mm4 \n\t"
1132 "movq %%mm2, %%mm5 \n\t"
1133 "punpcklwd %5, %%mm0 \n\t"
1134 "punpcklwd %5, %%mm1 \n\t"
1135 "punpcklwd %5, %%mm2 \n\t"
1136 "punpckhwd %5, %%mm3 \n\t"
1137 "punpckhwd %5, %%mm4 \n\t"
1138 "punpckhwd %5, %%mm5 \n\t"
1139 "psllq $8, %%mm1 \n\t"
1140 "psllq $16, %%mm2 \n\t"
1141 "por %%mm1, %%mm0 \n\t"
1142 "por %%mm2, %%mm0 \n\t"
1143 "psllq $8, %%mm4 \n\t"
1144 "psllq $16, %%mm5 \n\t"
1145 "por %%mm4, %%mm3 \n\t"
1146 "por %%mm5, %%mm3 \n\t"
1147 :"=m"(*d)
1148 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1149 :"memory");
1150 /* borrowed from the 32-to-24 packing above */
1151 asm volatile(
1152 "movq %%mm0, %%mm4 \n\t"
1153 "movq %%mm3, %%mm5 \n\t"
1154 "movq %%mm6, %%mm0 \n\t"
1155 "movq %%mm7, %%mm1 \n\t"
1156
1157 "movq %%mm4, %%mm6 \n\t"
1158 "movq %%mm5, %%mm7 \n\t"
1159 "movq %%mm0, %%mm2 \n\t"
1160 "movq %%mm1, %%mm3 \n\t"
1161
1162 "psrlq $8, %%mm2 \n\t"
1163 "psrlq $8, %%mm3 \n\t"
1164 "psrlq $8, %%mm6 \n\t"
1165 "psrlq $8, %%mm7 \n\t"
1166 "pand %2, %%mm0 \n\t"
1167 "pand %2, %%mm1 \n\t"
1168 "pand %2, %%mm4 \n\t"
1169 "pand %2, %%mm5 \n\t"
1170 "pand %3, %%mm2 \n\t"
1171 "pand %3, %%mm3 \n\t"
1172 "pand %3, %%mm6 \n\t"
1173 "pand %3, %%mm7 \n\t"
1174 "por %%mm2, %%mm0 \n\t"
1175 "por %%mm3, %%mm1 \n\t"
1176 "por %%mm6, %%mm4 \n\t"
1177 "por %%mm7, %%mm5 \n\t"
1178
1179 "movq %%mm1, %%mm2 \n\t"
1180 "movq %%mm4, %%mm3 \n\t"
1181 "psllq $48, %%mm2 \n\t"
1182 "psllq $32, %%mm3 \n\t"
1183 "pand %4, %%mm2 \n\t"
1184 "pand %5, %%mm3 \n\t"
1185 "por %%mm2, %%mm0 \n\t"
1186 "psrlq $16, %%mm1 \n\t"
1187 "psrlq $32, %%mm4 \n\t"
1188 "psllq $16, %%mm5 \n\t"
1189 "por %%mm3, %%mm1 \n\t"
1190 "pand %6, %%mm5 \n\t"
1191 "por %%mm5, %%mm4 \n\t"
1192
1193 MOVNTQ" %%mm0, %0 \n\t"
1194 MOVNTQ" %%mm1, 8%0 \n\t"
1195 MOVNTQ" %%mm4, 16%0"
1196
1197 :"=m"(*d)
1198 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1199 :"memory");
1200 d += 24;
1201 s += 8;
1202 }
1203 asm volatile(SFENCE:::"memory");
1204 asm volatile(EMMS:::"memory");
1205#endif
1206 while (s < end)
1207 {
1208 register uint16_t bgr;
1209 bgr = *s++;
1210 *d++ = (bgr&0x1F)<<3;
1211 *d++ = (bgr&0x7E0)>>3;
1212 *d++ = (bgr&0xF800)>>8;
1213 }
1214}
1215
1216static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1217{
1218 const uint16_t *end;
1219#ifdef HAVE_MMX
1220 const uint16_t *mm_end;
1221#endif
1222 uint8_t *d = dst;
1223 const uint16_t *s = (const uint16_t *)src;
1224 end = s + src_size/2;
1225#ifdef HAVE_MMX
1226 asm volatile(PREFETCH" %0"::"m"(*s):"memory");
1227 asm volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1228 mm_end = end - 3;
1229 while (s < mm_end)
1230 {
1231 asm volatile(
1232 PREFETCH" 32%1 \n\t"
1233 "movq %1, %%mm0 \n\t"
1234 "movq %1, %%mm1 \n\t"
1235 "movq %1, %%mm2 \n\t"
1236 "pand %2, %%mm0 \n\t"
1237 "pand %3, %%mm1 \n\t"
1238 "pand %4, %%mm2 \n\t"
1239 "psllq $3, %%mm0 \n\t"
1240 "psrlq $2, %%mm1 \n\t"
1241 "psrlq $7, %%mm2 \n\t"
1242 "movq %%mm0, %%mm3 \n\t"
1243 "movq %%mm1, %%mm4 \n\t"
1244 "movq %%mm2, %%mm5 \n\t"
1245 "punpcklwd %%mm7, %%mm0 \n\t"
1246 "punpcklwd %%mm7, %%mm1 \n\t"
1247 "punpcklwd %%mm7, %%mm2 \n\t"
1248 "punpckhwd %%mm7, %%mm3 \n\t"
1249 "punpckhwd %%mm7, %%mm4 \n\t"
1250 "punpckhwd %%mm7, %%mm5 \n\t"
1251 "psllq $8, %%mm1 \n\t"
1252 "psllq $16, %%mm2 \n\t"
1253 "por %%mm1, %%mm0 \n\t"
1254 "por %%mm2, %%mm0 \n\t"
1255 "psllq $8, %%mm4 \n\t"
1256 "psllq $16, %%mm5 \n\t"
1257 "por %%mm4, %%mm3 \n\t"
1258 "por %%mm5, %%mm3 \n\t"
1259 MOVNTQ" %%mm0, %0 \n\t"
1260 MOVNTQ" %%mm3, 8%0 \n\t"
1261 :"=m"(*d)
1262 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1263 :"memory");
1264 d += 16;
1265 s += 4;
1266 }
1267 asm volatile(SFENCE:::"memory");
1268 asm volatile(EMMS:::"memory");
1269#endif
1270 while (s < end)
1271 {
1272#if 0 //slightly slower on Athlon
1273 int bgr= *s++;
1274 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1275#else
1276 register uint16_t bgr;
1277 bgr = *s++;
1278#ifdef WORDS_BIGENDIAN
1279 *d++ = 0;
1280 *d++ = (bgr&0x7C00)>>7;
1281 *d++ = (bgr&0x3E0)>>2;
1282 *d++ = (bgr&0x1F)<<3;
1283#else
1284 *d++ = (bgr&0x1F)<<3;
1285 *d++ = (bgr&0x3E0)>>2;
1286 *d++ = (bgr&0x7C00)>>7;
1287 *d++ = 0;
1288#endif
1289
1290#endif
1291 }
1292}
1293
1294static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1295{
1296 const uint16_t *end;
1297#ifdef HAVE_MMX
1298 const uint16_t *mm_end;
1299#endif
1300 uint8_t *d = dst;
1301 const uint16_t *s = (const uint16_t*)src;
1302 end = s + src_size/2;
1303#ifdef HAVE_MMX
1304 asm volatile(PREFETCH" %0"::"m"(*s):"memory");
1305 asm volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1306 mm_end = end - 3;
1307 while (s < mm_end)
1308 {
1309 asm volatile(
1310 PREFETCH" 32%1 \n\t"
1311 "movq %1, %%mm0 \n\t"
1312 "movq %1, %%mm1 \n\t"
1313 "movq %1, %%mm2 \n\t"
1314 "pand %2, %%mm0 \n\t"
1315 "pand %3, %%mm1 \n\t"
1316 "pand %4, %%mm2 \n\t"
1317 "psllq $3, %%mm0 \n\t"
1318 "psrlq $3, %%mm1 \n\t"
1319 "psrlq $8, %%mm2 \n\t"
1320 "movq %%mm0, %%mm3 \n\t"
1321 "movq %%mm1, %%mm4 \n\t"
1322 "movq %%mm2, %%mm5 \n\t"
1323 "punpcklwd %%mm7, %%mm0 \n\t"
1324 "punpcklwd %%mm7, %%mm1 \n\t"
1325 "punpcklwd %%mm7, %%mm2 \n\t"
1326 "punpckhwd %%mm7, %%mm3 \n\t"
1327 "punpckhwd %%mm7, %%mm4 \n\t"
1328 "punpckhwd %%mm7, %%mm5 \n\t"
1329 "psllq $8, %%mm1 \n\t"
1330 "psllq $16, %%mm2 \n\t"
1331 "por %%mm1, %%mm0 \n\t"
1332 "por %%mm2, %%mm0 \n\t"
1333 "psllq $8, %%mm4 \n\t"
1334 "psllq $16, %%mm5 \n\t"
1335 "por %%mm4, %%mm3 \n\t"
1336 "por %%mm5, %%mm3 \n\t"
1337 MOVNTQ" %%mm0, %0 \n\t"
1338 MOVNTQ" %%mm3, 8%0 \n\t"
1339 :"=m"(*d)
1340 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1341 :"memory");
1342 d += 16;
1343 s += 4;
1344 }
1345 asm volatile(SFENCE:::"memory");
1346 asm volatile(EMMS:::"memory");
1347#endif
1348 while (s < end)
1349 {
1350 register uint16_t bgr;
1351 bgr = *s++;
1352#ifdef WORDS_BIGENDIAN
1353 *d++ = 0;
1354 *d++ = (bgr&0xF800)>>8;
1355 *d++ = (bgr&0x7E0)>>3;
1356 *d++ = (bgr&0x1F)<<3;
1357#else
1358 *d++ = (bgr&0x1F)<<3;
1359 *d++ = (bgr&0x7E0)>>3;
1360 *d++ = (bgr&0xF800)>>8;
1361 *d++ = 0;
1362#endif
1363 }
1364}
1365
1366static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1367{
1368 long idx = 15 - src_size;
1369 const uint8_t *s = src-idx;
1370 uint8_t *d = dst-idx;
1371#ifdef HAVE_MMX
1372 asm volatile(
1373 "test %0, %0 \n\t"
1374 "jns 2f \n\t"
1375 PREFETCH" (%1, %0) \n\t"
1376 "movq %3, %%mm7 \n\t"
1377 "pxor %4, %%mm7 \n\t"
1378 "movq %%mm7, %%mm6 \n\t"
1379 "pxor %5, %%mm7 \n\t"
1380 ASMALIGN(4)
1381 "1: \n\t"
1382 PREFETCH" 32(%1, %0) \n\t"
1383 "movq (%1, %0), %%mm0 \n\t"
1384 "movq 8(%1, %0), %%mm1 \n\t"
1385# ifdef HAVE_MMX2
1386 "pshufw $177, %%mm0, %%mm3 \n\t"
1387 "pshufw $177, %%mm1, %%mm5 \n\t"
1388 "pand %%mm7, %%mm0 \n\t"
1389 "pand %%mm6, %%mm3 \n\t"
1390 "pand %%mm7, %%mm1 \n\t"
1391 "pand %%mm6, %%mm5 \n\t"
1392 "por %%mm3, %%mm0 \n\t"
1393 "por %%mm5, %%mm1 \n\t"
1394# else
1395 "movq %%mm0, %%mm2 \n\t"
1396 "movq %%mm1, %%mm4 \n\t"
1397 "pand %%mm7, %%mm0 \n\t"
1398 "pand %%mm6, %%mm2 \n\t"
1399 "pand %%mm7, %%mm1 \n\t"
1400 "pand %%mm6, %%mm4 \n\t"
1401 "movq %%mm2, %%mm3 \n\t"
1402 "movq %%mm4, %%mm5 \n\t"
1403 "pslld $16, %%mm2 \n\t"
1404 "psrld $16, %%mm3 \n\t"
1405 "pslld $16, %%mm4 \n\t"
1406 "psrld $16, %%mm5 \n\t"
1407 "por %%mm2, %%mm0 \n\t"
1408 "por %%mm4, %%mm1 \n\t"
1409 "por %%mm3, %%mm0 \n\t"
1410 "por %%mm5, %%mm1 \n\t"
1411# endif
1412 MOVNTQ" %%mm0, (%2, %0) \n\t"
1413 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1414 "add $16, %0 \n\t"
1415 "js 1b \n\t"
1416 SFENCE" \n\t"
1417 EMMS" \n\t"
1418 "2: \n\t"
1419 : "+&r"(idx)
1420 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1421 : "memory");
1422#endif
1423 for (; idx<15; idx+=4) {
1424 register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1425 v &= 0xff00ff;
1426 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
1427 }
1428}
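/* Hedged sketch of the biased-index idiom used above (illustrative only):
 * offsetting both pointers by -idx lets one signed register act as loop
 * counter and memory offset at once, so the asm loop needs only a sign test
 * ("js 1b") instead of a separate compare. rgb32tobgr32() biases by 15 so
 * the MMX loop stops before the final partial 16-byte block and the C tail
 * finishes; this sketch uses a bias of 0 for simplicity. */
static void copy_with_biased_index(const uint8_t *src, uint8_t *dst, long n)
{
    long idx = -n;   /* runs from -n up toward 0 */
    src -= idx;      /* now src[idx] spans the whole buffer */
    dst -= idx;
    for (; idx < 0; idx++)
        dst[idx] = src[idx];
}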
1429
1430static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1431{
1432 unsigned i;
1433#ifdef HAVE_MMX
1434 long mmx_size= 23 - src_size;
1435 asm volatile (
1436 "test %%"REG_a", %%"REG_a" \n\t"
1437 "jns 2f \n\t"
1438 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1439 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1440 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1441 ASMALIGN(4)
1442 "1: \n\t"
1443 PREFETCH" 32(%1, %%"REG_a") \n\t"
1444 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1445 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1446 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1447 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1448 "pand %%mm5, %%mm0 \n\t"
1449 "pand %%mm6, %%mm1 \n\t"
1450 "pand %%mm7, %%mm2 \n\t"
1451 "por %%mm0, %%mm1 \n\t"
1452 "por %%mm2, %%mm1 \n\t"
1453 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1454 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1455 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1456 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1457 "pand %%mm7, %%mm0 \n\t"
1458 "pand %%mm5, %%mm1 \n\t"
1459 "pand %%mm6, %%mm2 \n\t"
1460 "por %%mm0, %%mm1 \n\t"
1461 "por %%mm2, %%mm1 \n\t"
1462 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1463 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1464 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1465 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1466 "pand %%mm6, %%mm0 \n\t"
1467 "pand %%mm7, %%mm1 \n\t"
1468 "pand %%mm5, %%mm2 \n\t"
1469 "por %%mm0, %%mm1 \n\t"
1470 "por %%mm2, %%mm1 \n\t"
1471 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1472 "add $24, %%"REG_a" \n\t"
1473 " js 1b \n\t"
1474 "2: \n\t"
1475 : "+a" (mmx_size)
1476 : "r" (src-mmx_size), "r"(dst-mmx_size)
1477 );
1478
1479 asm volatile(SFENCE:::"memory");
1480 asm volatile(EMMS:::"memory");
1481
1482 if (mmx_size==23) return; //finished, src_size was a multiple of 8 pixels (24 bytes)
1483
1484 src+= src_size;
1485 dst+= src_size;
1486 src_size= 23-mmx_size;
1487 src-= src_size;
1488 dst-= src_size;
1489#endif
1490 for (i=0; i<src_size; i+=3)
1491 {
1492 register uint8_t x;
1493 x = src[i + 2];
1494 dst[i + 1] = src[i + 1];
1495 dst[i + 2] = src[i + 0];
1496 dst[i + 0] = x;
1497 }
1498}
1499
1500static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1501 long width, long height,
1502 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1503{
1504 long y;
1505 const long chromWidth= width>>1;
1506 for (y=0; y<height; y++)
1507 {
1508#ifdef HAVE_MMX
1509//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1510 asm volatile(
1511 "xor %%"REG_a", %%"REG_a" \n\t"
1512 ASMALIGN(4)
1513 "1: \n\t"
1514 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1515 PREFETCH" 32(%2, %%"REG_a") \n\t"
1516 PREFETCH" 32(%3, %%"REG_a") \n\t"
1517 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1518 "movq %%mm0, %%mm2 \n\t" // U(0)
1519 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1520 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1521 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1522
1523 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1524 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1525 "movq %%mm3, %%mm4 \n\t" // Y(0)
1526 "movq %%mm5, %%mm6 \n\t" // Y(8)
1527 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1528 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1529 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1530 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1531
1532 MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1533 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1534 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1535 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1536
1537 "add $8, %%"REG_a" \n\t"
1538 "cmp %4, %%"REG_a" \n\t"
1539 " jb 1b \n\t"
1540 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1541 : "%"REG_a
1542 );
1543#else
1544
1545#if defined ARCH_ALPHA && defined HAVE_MVI
1546#define pl2yuy2(n) \
1547 y1 = yc[n]; \
1548 y2 = yc2[n]; \
1549 u = uc[n]; \
1550 v = vc[n]; \
1551 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1552 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1553 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1554 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1555 yuv1 = (u << 8) + (v << 24); \
1556 yuv2 = yuv1 + y2; \
1557 yuv1 += y1; \
1558 qdst[n] = yuv1; \
1559 qdst2[n] = yuv2;
1560
1561 int i;
1562 uint64_t *qdst = (uint64_t *) dst;
1563 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1564 const uint32_t *yc = (uint32_t *) ysrc;
1565 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1566 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1567 for (i = 0; i < chromWidth; i += 8){
1568 uint64_t y1, y2, yuv1, yuv2;
1569 uint64_t u, v;
1570 /* Prefetch */
1571 asm("ldq $31,64(%0)" :: "r"(yc));
1572 asm("ldq $31,64(%0)" :: "r"(yc2));
1573 asm("ldq $31,64(%0)" :: "r"(uc));
1574 asm("ldq $31,64(%0)" :: "r"(vc));
1575
1576 pl2yuy2(0);
1577 pl2yuy2(1);
1578 pl2yuy2(2);
1579 pl2yuy2(3);
1580
1581 yc += 4;
1582 yc2 += 4;
1583 uc += 4;
1584 vc += 4;
1585 qdst += 4;
1586 qdst2 += 4;
1587 }
1588 y++;
1589 ysrc += lumStride;
1590 dst += dstStride;
1591
1592#elif __WORDSIZE >= 64
1593 int i;
1594 uint64_t *ldst = (uint64_t *) dst;
1595 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1596 for (i = 0; i < chromWidth; i += 2){
1597 uint64_t k, l;
1598 k = yc[0] + (uc[0] << 8) +
1599 (yc[1] << 16) + (vc[0] << 24);
1600 l = yc[2] + (uc[1] << 8) +
1601 (yc[3] << 16) + (vc[1] << 24);
1602 *ldst++ = k + (l << 32);
1603 yc += 4;
1604 uc += 2;
1605 vc += 2;
1606 }
1607
1608#else
1609 int i, *idst = (int32_t *) dst;
1610 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1611 for (i = 0; i < chromWidth; i++){
1612#ifdef WORDS_BIGENDIAN
1613 *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1614 (yc[1] << 8) + (vc[0] << 0);
1615#else
1616 *idst++ = yc[0] + (uc[0] << 8) +
1617 (yc[1] << 16) + (vc[0] << 24);
1618#endif
1619 yc += 2;
1620 uc++;
1621 vc++;
1622 }
1623#endif
1624#endif
1625 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
1626 {
1627 usrc += chromStride;
1628 vsrc += chromStride;
1629 }
1630 ysrc += lumStride;
1631 dst += dstStride;
1632 }
1633#ifdef HAVE_MMX
1634asm( EMMS" \n\t"
1635 SFENCE" \n\t"
1636 :::"memory");
1637#endif
1638}
1639
1640/**
1641 * Height should be a multiple of 2 and width should be a multiple of 16.
1642 * (If this is a problem for anyone then tell me, and I will fix it.)
1643 */
1644static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1645 long width, long height,
1646 long lumStride, long chromStride, long dstStride)
1647{
1648 //FIXME interpolate chroma
1649 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1650}
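/* A hedged usage sketch (assuming the yv12toyuy2 function pointer declared
 * in rgb2rgb.h and initialized by sws_rgb2rgb_init(); the helper name and
 * stride choices here are illustrative): pack contiguous YV12 planes into
 * one YUY2 buffer. Per the comment above, width must be a multiple of 16
 * and height a multiple of 2. */
static void pack_yv12_frame(const uint8_t *y, const uint8_t *u, const uint8_t *v,
                            uint8_t *out, long w, long h)
{
    /* contiguous planes: luma stride w, chroma stride w/2, packed stride 2*w */
    yv12toyuy2(y, u, v, out, w, h, w, w/2, 2*w);
}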
1651
1652static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1653 long width, long height,
1654 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1655{
1656 long y;
1657 const long chromWidth= width>>1;
1658 for (y=0; y<height; y++)
1659 {
1660#ifdef HAVE_MMX
1661//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1662 asm volatile(
1663 "xor %%"REG_a", %%"REG_a" \n\t"
1664 ASMALIGN(4)
1665 "1: \n\t"
1666 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1667 PREFETCH" 32(%2, %%"REG_a") \n\t"
1668 PREFETCH" 32(%3, %%"REG_a") \n\t"
1669 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1670 "movq %%mm0, %%mm2 \n\t" // U(0)
1671 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1672 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1673 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1674
1675 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1676 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1677 "movq %%mm0, %%mm4 \n\t" // Y(0)
1678 "movq %%mm2, %%mm6 \n\t" // Y(8)
1679 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1680 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1681 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1682 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1683
1684 MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1685 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1686 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1687 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1688
1689 "add $8, %%"REG_a" \n\t"
1690 "cmp %4, %%"REG_a" \n\t"
1691 " jb 1b \n\t"
1692 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1693 : "%"REG_a
1694 );
1695#else
1696//FIXME adapt the Alpha ASM code from yv12->yuy2
1697
1698#if __WORDSIZE >= 64
1699 int i;
1700 uint64_t *ldst = (uint64_t *) dst;
1701 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1702 for (i = 0; i < chromWidth; i += 2){
1703 uint64_t k, l;
1704 k = uc[0] + (yc[0] << 8) +
1705 (vc[0] << 16) + (yc[1] << 24);
1706 l = uc[1] + (yc[2] << 8) +
1707 (vc[1] << 16) + (yc[3] << 24);
1708 *ldst++ = k + (l << 32);
1709 yc += 4;
1710 uc += 2;
1711 vc += 2;
1712 }
1713
1714#else
1715 int i, *idst = (int32_t *) dst;
1716 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1717 for (i = 0; i < chromWidth; i++){
1718#ifdef WORDS_BIGENDIAN
1719 *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1720 (vc[0] << 8) + (yc[1] << 0);
1721#else
1722 *idst++ = uc[0] + (yc[0] << 8) +
1723 (vc[0] << 16) + (yc[1] << 24);
1724#endif
1725 yc += 2;
1726 uc++;
1727 vc++;
1728 }
1729#endif
1730#endif
1731 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
1732 {
1733 usrc += chromStride;
1734 vsrc += chromStride;
1735 }
1736 ysrc += lumStride;
1737 dst += dstStride;
1738 }
1739#ifdef HAVE_MMX
1740asm( EMMS" \n\t"
1741 SFENCE" \n\t"
1742 :::"memory");
1743#endif
1744}
1745
1746/**
1747 * Height should be a multiple of 2 and width should be a multiple of 16.
1748 * (If this is a problem for anyone then tell me, and I will fix it.)
1749 */
1750static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1751 long width, long height,
1752 long lumStride, long chromStride, long dstStride)
1753{
1754 //FIXME interpolate chroma
1755 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1756}
1757
1758/**
1759 * Width should be a multiple of 16.
1760 */
1761static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1762 long width, long height,
1763 long lumStride, long chromStride, long dstStride)
1764{
1765 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1766}
1767
1768/**
1769 * Height should be a multiple of 2 and width should be a multiple of 16.
1770 * (If this is a problem for anyone then tell me, and I will fix it.)
1771 */
1772static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1773 long width, long height,
1774 long lumStride, long chromStride, long srcStride)
1775{
1776 long y;
1777 const long chromWidth= width>>1;
1778 for (y=0; y<height; y+=2)
1779 {
1780#ifdef HAVE_MMX
1781 asm volatile(
1782 "xor %%"REG_a", %%"REG_a" \n\t"
1783 "pcmpeqw %%mm7, %%mm7 \n\t"
1784 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1785 ASMALIGN(4)
1786 "1: \n\t"
1787 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1788 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1789 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1790 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1791 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1792 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1793 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1794 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1795 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1796 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1797 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1798
1799 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1800
1801 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1802 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1803 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1804 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1805 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1806 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1807 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1808 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1809 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1810 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1811
1812 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1813
1814 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1815 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1816 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1817 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1818 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1819 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1820 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1821 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1822
1823 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1824 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1825
1826 "add $8, %%"REG_a" \n\t"
1827 "cmp %4, %%"REG_a" \n\t"
1828 " jb 1b \n\t"
1829 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1830 : "memory", "%"REG_a
1831 );
1832
1833 ydst += lumStride;
1834 src += srcStride;
1835
1836 asm volatile(
1837 "xor %%"REG_a", %%"REG_a" \n\t"
1838 ASMALIGN(4)
1839 "1: \n\t"
1840 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1841 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1842 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1843 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1844 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1845 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1846 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1847 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1848 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1849 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1850 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1851
1852 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1853 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1854
1855 "add $8, %%"REG_a" \n\t"
1856 "cmp %4, %%"REG_a" \n\t"
1857 " jb 1b \n\t"
1858
1859 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1860 : "memory", "%"REG_a
1861 );
1862#else
1863 long i;
1864 for (i=0; i<chromWidth; i++)
1865 {
1866 ydst[2*i+0] = src[4*i+0];
1867 udst[i] = src[4*i+1];
1868 ydst[2*i+1] = src[4*i+2];
1869 vdst[i] = src[4*i+3];
1870 }
1871 ydst += lumStride;
1872 src += srcStride;
1873
1874 for (i=0; i<chromWidth; i++)
1875 {
1876 ydst[2*i+0] = src[4*i+0];
1877 ydst[2*i+1] = src[4*i+2];
1878 }
1879#endif
1880 udst += chromStride;
1881 vdst += chromStride;
1882 ydst += lumStride;
1883 src += srcStride;
1884 }
1885#ifdef HAVE_MMX
1886asm volatile( EMMS" \n\t"
1887 SFENCE" \n\t"
1888 :::"memory");
1889#endif
1890}
1891
1892static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1893 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1894 long width, long height, long lumStride, long chromStride)
1895{
1896 /* Y Plane */
1897 memcpy(ydst, ysrc, width*height);
1898
1899 /* XXX: implement upscaling for U,V */
1900}
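/* A hedged sketch of the missing chroma upscaling noted above: YVU9 stores
 * chroma at 1/4 x 1/4 of the luma resolution, YV12 at 1/2 x 1/2, so each
 * source chroma sample would cover a 2x2 destination block. This is a
 * nearest-neighbor version (no filtering), illustrative only. */
static void chroma_upscale2x(const uint8_t *src, uint8_t *dst,
                             long dstW, long dstH,
                             long srcStride, long dstStride)
{
    long x, y;
    for (y = 0; y < dstH; y++)
        for (x = 0; x < dstW; x++)
            dst[y*dstStride + x] = src[(y>>1)*srcStride + (x>>1)];
}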
1901
1902static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1903{
1904 long x,y;
1905
1906 dst[0]= src[0];
1907
1908 // first line
1909 for (x=0; x<srcWidth-1; x++){
1910 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1911 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1912 }
1913 dst[2*srcWidth-1]= src[srcWidth-1];
1914
1915 dst+= dstStride;
1916
1917 for (y=1; y<srcHeight; y++){
1918#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1919 const long mmxSize= srcWidth&~15;
1920 asm volatile(
1921 "mov %4, %%"REG_a" \n\t"
1922 "1: \n\t"
1923 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1924 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1925 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1926 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1927 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1928 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
1929 PAVGB" %%mm0, %%mm5 \n\t"
1930 PAVGB" %%mm0, %%mm3 \n\t"
1931 PAVGB" %%mm0, %%mm5 \n\t"
1932 PAVGB" %%mm0, %%mm3 \n\t"
1933 PAVGB" %%mm1, %%mm4 \n\t"
1934 PAVGB" %%mm1, %%mm2 \n\t"
1935 PAVGB" %%mm1, %%mm4 \n\t"
1936 PAVGB" %%mm1, %%mm2 \n\t"
1937 "movq %%mm5, %%mm7 \n\t"
1938 "movq %%mm4, %%mm6 \n\t"
1939 "punpcklbw %%mm3, %%mm5 \n\t"
1940 "punpckhbw %%mm3, %%mm7 \n\t"
1941 "punpcklbw %%mm2, %%mm4 \n\t"
1942 "punpckhbw %%mm2, %%mm6 \n\t"
1943#if 1
1944 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1945 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1946 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1947 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1948#else
1949 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1950 "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1951 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1952 "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1953#endif
1954 "add $8, %%"REG_a" \n\t"
1955 " js 1b \n\t"
1956 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1957 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1958 "g" (-mmxSize)
1959 : "%"REG_a
1960
1961 );
1962#else
1963 const long mmxSize=1;
1964#endif
1965 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1966 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
1967
1968 for (x=mmxSize-1; x<srcWidth-1; x++){
1969 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1970 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1971 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1972 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1973 }
1974 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1975 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1976
1977 dst+=dstStride*2;
1978 src+=srcStride;
1979 }
1980
1981 // last line
1982#if 1
1983 dst[0]= src[0];
1984
1985 for (x=0; x<srcWidth-1; x++){
1986 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1987 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1988 }
1989 dst[2*srcWidth-1]= src[srcWidth-1];
1990#else
1991 for (x=0; x<srcWidth; x++){
1992 dst[2*x+0]=
1993 dst[2*x+1]= src[x];
1994 }
1995#endif
1996
1997#ifdef HAVE_MMX
1998asm volatile( EMMS" \n\t"
1999 SFENCE" \n\t"
2000 :::"memory");
2001#endif
2002}
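/* A note on the kernel used by planar2x() above (hedged, derived from its C
 * fall-back): each output sample is a 3:1 blend of its two nearest inputs
 * per axis,
 *     dst[2x+1] = (3*src[x] +   src[x+1]) >> 2
 *     dst[2x+2] = (  src[x] + 3*src[x+1]) >> 2
 * The doubled PAVGB pairs in the MMX2/3DNOW path approximate the same
 * weights, since avg(a, avg(a,b)) == (3a + b)/4 up to rounding. */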
2003
2004/**
2005 * Height should be a multiple of 2 and width should be a multiple of 16.
2006 * (If this is a problem for anyone then tell me, and I will fix it.)
2007 * Chrominance data is only taken from every second line; the others are ignored.
2008 * FIXME: Write HQ version.
2009 */
2010static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2011 long width, long height,
2012 long lumStride, long chromStride, long srcStride)
2013{
2014 long y;
2015 const long chromWidth= width>>1;
2016 for (y=0; y<height; y+=2)
2017 {
2018#ifdef HAVE_MMX
2019 asm volatile(
2020 "xorl %%eax, %%eax \n\t"
2021 "pcmpeqw %%mm7, %%mm7 \n\t"
2022 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
2023 ASMALIGN(4)
2024 "1: \n\t"
2025 PREFETCH" 64(%0, %%eax, 4) \n\t"
2026 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
2027 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
2028 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
2029 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
2030 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
2031 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
2032 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
2033 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
2034 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
2035 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
2036
2037 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
2038
2039 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
2040 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
2041 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2042 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2043 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2044 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2045 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2046 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2047 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2048 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2049
2050 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
2051
2052 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2053 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2054 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2055 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2056 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2057 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2058 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2059 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2060
2061 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
2062 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
2063
2064 "addl $8, %%eax \n\t"
2065 "cmpl %4, %%eax \n\t"
2066 " jb 1b \n\t"
2067 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2068 : "memory", "%eax"
2069 );
2070
2071 ydst += lumStride;
2072 src += srcStride;
2073
2074 asm volatile(
2075 "xorl %%eax, %%eax \n\t"
2076 ASMALIGN(4)
2077 "1: \n\t"
2078 PREFETCH" 64(%0, %%eax, 4) \n\t"
2079 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
2080 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
2081 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
2082 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
2083 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2084 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2085 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2086 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2087 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2088 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2089
2090 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
2091 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
2092
2093 "addl $8, %%eax \n\t"
2094 "cmpl %4, %%eax \n\t"
2095 " jb 1b \n\t"
2096
2097 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2098 : "memory", "%eax"
2099 );
2100#else
2101 long i;
2102 for (i=0; i<chromWidth; i++)
2103 {
2104 udst[i] = src[4*i+0];
2105 ydst[2*i+0] = src[4*i+1];
2106 vdst[i] = src[4*i+2];
2107 ydst[2*i+1] = src[4*i+3];
2108 }
2109 ydst += lumStride;
2110 src += srcStride;
2111
2112 for (i=0; i<chromWidth; i++)
2113 {
2114 ydst[2*i+0] = src[4*i+1];
2115 ydst[2*i+1] = src[4*i+3];
2116 }
2117#endif
2118 udst += chromStride;
2119 vdst += chromStride;
2120 ydst += lumStride;
2121 src += srcStride;
2122 }
2123#ifdef HAVE_MMX
2124asm volatile( EMMS" \n\t"
2125 SFENCE" \n\t"
2126 :::"memory");
2127#endif
2128}
2129
2130/**
2131 * Height should be a multiple of 2 and width should be a multiple of 2.
2132 * (If this is a problem for anyone then tell me, and I will fix it.)
2133 * Chrominance data is only taken from every second line;
2134 * the others are ignored in the C version.
2135 * FIXME: Write HQ version.
2136 */
2137static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2138 long width, long height,
2139 long lumStride, long chromStride, long srcStride)
2140{
2141 long y;
2142 const long chromWidth= width>>1;
2143#ifdef HAVE_MMX
2144 for (y=0; y<height-2; y+=2)
2145 {
2146 long i;
2147 for (i=0; i<2; i++)
2148 {
2149 asm volatile(
2150 "mov %2, %%"REG_a" \n\t"
2151 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
2152 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2153 "pxor %%mm7, %%mm7 \n\t"
2154 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2155 ASMALIGN(4)
2156 "1: \n\t"
2157 PREFETCH" 64(%0, %%"REG_d") \n\t"
2158 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2159 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
2160 "punpcklbw %%mm7, %%mm0 \n\t"
2161 "punpcklbw %%mm7, %%mm1 \n\t"
2162 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
2163 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
2164 "punpcklbw %%mm7, %%mm2 \n\t"
2165 "punpcklbw %%mm7, %%mm3 \n\t"
2166 "pmaddwd %%mm6, %%mm0 \n\t"
2167 "pmaddwd %%mm6, %%mm1 \n\t"
2168 "pmaddwd %%mm6, %%mm2 \n\t"
2169 "pmaddwd %%mm6, %%mm3 \n\t"
2170#ifndef FAST_BGR2YV12
2171 "psrad $8, %%mm0 \n\t"
2172 "psrad $8, %%mm1 \n\t"
2173 "psrad $8, %%mm2 \n\t"
2174 "psrad $8, %%mm3 \n\t"
2175#endif
2176 "packssdw %%mm1, %%mm0 \n\t"
2177 "packssdw %%mm3, %%mm2 \n\t"
2178 "pmaddwd %%mm5, %%mm0 \n\t"
2179 "pmaddwd %%mm5, %%mm2 \n\t"
2180 "packssdw %%mm2, %%mm0 \n\t"
2181 "psraw $7, %%mm0 \n\t"
2182
2183 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2184 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
2185 "punpcklbw %%mm7, %%mm4 \n\t"
2186 "punpcklbw %%mm7, %%mm1 \n\t"
2187 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
2188 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
2189 "punpcklbw %%mm7, %%mm2 \n\t"
2190 "punpcklbw %%mm7, %%mm3 \n\t"
2191 "pmaddwd %%mm6, %%mm4 \n\t"
2192 "pmaddwd %%mm6, %%mm1 \n\t"
2193 "pmaddwd %%mm6, %%mm2 \n\t"
2194 "pmaddwd %%mm6, %%mm3 \n\t"
2195#ifndef FAST_BGR2YV12
2196 "psrad $8, %%mm4 \n\t"
2197 "psrad $8, %%mm1 \n\t"
2198 "psrad $8, %%mm2 \n\t"
2199 "psrad $8, %%mm3 \n\t"
2200#endif
2201 "packssdw %%mm1, %%mm4 \n\t"
2202 "packssdw %%mm3, %%mm2 \n\t"
2203 "pmaddwd %%mm5, %%mm4 \n\t"
2204 "pmaddwd %%mm5, %%mm2 \n\t"
2205 "add $24, %%"REG_d" \n\t"
2206 "packssdw %%mm2, %%mm4 \n\t"
2207 "psraw $7, %%mm4 \n\t"
2208
2209 "packuswb %%mm4, %%mm0 \n\t"
2210 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
2211
2212 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2213 "add $8, %%"REG_a" \n\t"
2214 " js 1b \n\t"
2215 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2216 : "%"REG_a, "%"REG_d
2217 );
2218 ydst += lumStride;
2219 src += srcStride;
2220 }
2221 src -= srcStride*2;
2222 asm volatile(
2223 "mov %4, %%"REG_a" \n\t"
2224 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2225 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
2226 "pxor %%mm7, %%mm7 \n\t"
2227 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2228 "add %%"REG_d", %%"REG_d" \n\t"
2229 ASMALIGN(4)
2230 "1: \n\t"
2231 PREFETCH" 64(%0, %%"REG_d") \n\t"
2232 PREFETCH" 64(%1, %%"REG_d") \n\t"
2233#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2234 "movq (%0, %%"REG_d"), %%mm0 \n\t"
2235 "movq (%1, %%"REG_d"), %%mm1 \n\t"
2236 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
2237 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
2238 PAVGB" %%mm1, %%mm0 \n\t"
2239 PAVGB" %%mm3, %%mm2 \n\t"
2240 "movq %%mm0, %%mm1 \n\t"
2241 "movq %%mm2, %%mm3 \n\t"
2242 "psrlq $24, %%mm0 \n\t"
2243 "psrlq $24, %%mm2 \n\t"
2244 PAVGB" %%mm1, %%mm0 \n\t"
2245 PAVGB" %%mm3, %%mm2 \n\t"
2246 "punpcklbw %%mm7, %%mm0 \n\t"
2247 "punpcklbw %%mm7, %%mm2 \n\t"
2248#else
2249 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2250 "movd (%1, %%"REG_d"), %%mm1 \n\t"
2251 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
2252 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
2253 "punpcklbw %%mm7, %%mm0 \n\t"
2254 "punpcklbw %%mm7, %%mm1 \n\t"
2255 "punpcklbw %%mm7, %%mm2 \n\t"
2256 "punpcklbw %%mm7, %%mm3 \n\t"
2257 "paddw %%mm1, %%mm0 \n\t"
2258 "paddw %%mm3, %%mm2 \n\t"
2259 "paddw %%mm2, %%mm0 \n\t"
2260 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
2261 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
2262 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
2263 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
2264 "punpcklbw %%mm7, %%mm4 \n\t"
2265 "punpcklbw %%mm7, %%mm1 \n\t"
2266 "punpcklbw %%mm7, %%mm2 \n\t"
2267 "punpcklbw %%mm7, %%mm3 \n\t"
2268 "paddw %%mm1, %%mm4 \n\t"
2269 "paddw %%mm3, %%mm2 \n\t"
2270 "paddw %%mm4, %%mm2 \n\t"
2271 "psrlw $2, %%mm0 \n\t"
2272 "psrlw $2, %%mm2 \n\t"
2273#endif
2274 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2275 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2276
2277 "pmaddwd %%mm0, %%mm1 \n\t"
2278 "pmaddwd %%mm2, %%mm3 \n\t"
2279 "pmaddwd %%mm6, %%mm0 \n\t"
2280 "pmaddwd %%mm6, %%mm2 \n\t"
2281#ifndef FAST_BGR2YV12
2282 "psrad $8, %%mm0 \n\t"
2283 "psrad $8, %%mm1 \n\t"
2284 "psrad $8, %%mm2 \n\t"
2285 "psrad $8, %%mm3 \n\t"
2286#endif
2287 "packssdw %%mm2, %%mm0 \n\t"
2288 "packssdw %%mm3, %%mm1 \n\t"
2289 "pmaddwd %%mm5, %%mm0 \n\t"
2290 "pmaddwd %%mm5, %%mm1 \n\t"
2291 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2292 "psraw $7, %%mm0 \n\t"
2293
2294#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2295 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2296 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2297 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2298 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2299 PAVGB" %%mm1, %%mm4 \n\t"
2300 PAVGB" %%mm3, %%mm2 \n\t"
2301 "movq %%mm4, %%mm1 \n\t"
2302 "movq %%mm2, %%mm3 \n\t"
2303 "psrlq $24, %%mm4 \n\t"
2304 "psrlq $24, %%mm2 \n\t"
2305 PAVGB" %%mm1, %%mm4 \n\t"
2306 PAVGB" %%mm3, %%mm2 \n\t"
2307 "punpcklbw %%mm7, %%mm4 \n\t"
2308 "punpcklbw %%mm7, %%mm2 \n\t"
2309#else
2310 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2311 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2312 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2313 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2314 "punpcklbw %%mm7, %%mm4 \n\t"
2315 "punpcklbw %%mm7, %%mm1 \n\t"
2316 "punpcklbw %%mm7, %%mm2 \n\t"
2317 "punpcklbw %%mm7, %%mm3 \n\t"
2318 "paddw %%mm1, %%mm4 \n\t"
2319 "paddw %%mm3, %%mm2 \n\t"
2320 "paddw %%mm2, %%mm4 \n\t"
2321 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2322 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2323 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2324 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2325 "punpcklbw %%mm7, %%mm5 \n\t"
2326 "punpcklbw %%mm7, %%mm1 \n\t"
2327 "punpcklbw %%mm7, %%mm2 \n\t"
2328 "punpcklbw %%mm7, %%mm3 \n\t"
2329 "paddw %%mm1, %%mm5 \n\t"
2330 "paddw %%mm3, %%mm2 \n\t"
2331 "paddw %%mm5, %%mm2 \n\t"
2332 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2333 "psrlw $2, %%mm4 \n\t"
2334 "psrlw $2, %%mm2 \n\t"
2335#endif
2336 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2337 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2338
2339 "pmaddwd %%mm4, %%mm1 \n\t"
2340 "pmaddwd %%mm2, %%mm3 \n\t"
2341 "pmaddwd %%mm6, %%mm4 \n\t"
2342 "pmaddwd %%mm6, %%mm2 \n\t"
2343#ifndef FAST_BGR2YV12
2344 "psrad $8, %%mm4 \n\t"
2345 "psrad $8, %%mm1 \n\t"
2346 "psrad $8, %%mm2 \n\t"
2347 "psrad $8, %%mm3 \n\t"
2348#endif
2349 "packssdw %%mm2, %%mm4 \n\t"
2350 "packssdw %%mm3, %%mm1 \n\t"
2351 "pmaddwd %%mm5, %%mm4 \n\t"
2352 "pmaddwd %%mm5, %%mm1 \n\t"
2353 "add $24, %%"REG_d" \n\t"
2354 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2355 "psraw $7, %%mm4 \n\t"
2356
2357 "movq %%mm0, %%mm1 \n\t"
2358 "punpckldq %%mm4, %%mm0 \n\t"
2359 "punpckhdq %%mm4, %%mm1 \n\t"
2360 "packsswb %%mm1, %%mm0 \n\t"
2361 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
2362 "movd %%mm0, (%2, %%"REG_a") \n\t"
2363 "punpckhdq %%mm0, %%mm0 \n\t"
2364 "movd %%mm0, (%3, %%"REG_a") \n\t"
2365 "add $4, %%"REG_a" \n\t"
2366 " js 1b \n\t"
2367 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2368 : "%"REG_a, "%"REG_d
2369 );
2370
2371 udst += chromStride;
2372 vdst += chromStride;
2373 src += srcStride*2;
2374 }
2375
2376 asm volatile( EMMS" \n\t"
2377 SFENCE" \n\t"
2378 :::"memory");
2379#else
2380 y=0;
2381#endif
2382 for (; y<height; y+=2)
2383 {
2384 long i;
2385 for (i=0; i<chromWidth; i++)
2386 {
2387 unsigned int b = src[6*i+0];
2388 unsigned int g = src[6*i+1];
2389 unsigned int r = src[6*i+2];
2390
2391 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2392 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2393 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2394
2395 udst[i] = U;
2396 vdst[i] = V;
2397 ydst[2*i] = Y;
2398
2399 b = src[6*i+3];
2400 g = src[6*i+4];
2401 r = src[6*i+5];
2402
2403 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2404 ydst[2*i+1] = Y;
2405 }
2406 ydst += lumStride;
2407 src += srcStride;
2408
2409 for (i=0; i<chromWidth; i++)
2410 {
2411 unsigned int b = src[6*i+0];
2412 unsigned int g = src[6*i+1];
2413 unsigned int r = src[6*i+2];
2414
2415 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2416
2417 ydst[2*i] = Y;
2418
2419 b = src[6*i+3];
2420 g = src[6*i+4];
2421 r = src[6*i+5];
2422
2423 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2424 ydst[2*i+1] = Y;
2425 }
2426 udst += chromStride;
2427 vdst += chromStride;
2428 ydst += lumStride;
2429 src += srcStride;
2430 }
2431}
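/* Note: the C fallback above produces 4:2:0 (YV12): each iteration consumes
   two source rows, taking luma from every pixel but chroma only from the
   top-left pixel of each 2x2 block, whereas the MMX path above averages the
   full 2x2 neighborhood before applying the U/V transform. */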
2432
2433void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2434 long width, long height, long src1Stride,
2435 long src2Stride, long dstStride){
2436 long h;
2437
2438 for (h=0; h < height; h++)
2439 {
2440 long w;
2441
2442#ifdef HAVE_MMX
2443#ifdef HAVE_SSE2
2444 asm(
2445 "xor %%"REG_a", %%"REG_a" \n\t"
2446 "1: \n\t"
2447 PREFETCH" 64(%1, %%"REG_a") \n\t"
2448 PREFETCH" 64(%2, %%"REG_a") \n\t"
2449 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2450 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2451 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2452 "punpcklbw %%xmm2, %%xmm0 \n\t"
2453 "punpckhbw %%xmm2, %%xmm1 \n\t"
2454 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
2455 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
2456 "add $16, %%"REG_a" \n\t"
2457 "cmp %3, %%"REG_a" \n\t"
2458 " jb 1b \n\t"
2459 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2460 : "memory", "%"REG_a""
2461 );
2462#else
2463 asm(
2464 "xor %%"REG_a", %%"REG_a" \n\t"
2465 "1: \n\t"
2466 PREFETCH" 64(%1, %%"REG_a") \n\t"
2467 PREFETCH" 64(%2, %%"REG_a") \n\t"
2468 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2469 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2470 "movq %%mm0, %%mm1 \n\t"
2471 "movq %%mm2, %%mm3 \n\t"
2472 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2473 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2474 "punpcklbw %%mm4, %%mm0 \n\t"
2475 "punpckhbw %%mm4, %%mm1 \n\t"
2476 "punpcklbw %%mm5, %%mm2 \n\t"
2477 "punpckhbw %%mm5, %%mm3 \n\t"
2478 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
2479 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
2480 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
2481 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
2482 "add $16, %%"REG_a" \n\t"
2483 "cmp %3, %%"REG_a" \n\t"
2484 " jb 1b \n\t"
2485 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2486 : "memory", "%"REG_a
2487 );
2488#endif
2489 for (w= (width&(~15)); w < width; w++)
2490 {
2491 dest[2*w+0] = src1[w];
2492 dest[2*w+1] = src2[w];
2493 }
2494#else
2495 for (w=0; w < width; w++)
2496 {
2497 dest[2*w+0] = src1[w];
2498 dest[2*w+1] = src2[w];
2499 }
2500#endif
2501 dest += dstStride;
2502 src1 += src1Stride;
2503 src2 += src2Stride;
2504 }
2505#ifdef HAVE_MMX
2506 asm(
2507 EMMS" \n\t"
2508 SFENCE" \n\t"
2509 ::: "memory"
2510 );
2511#endif
2512}
2513
2514static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2515 uint8_t *dst1, uint8_t *dst2,
2516 long width, long height,
2517 long srcStride1, long srcStride2,
2518 long dstStride1, long dstStride2)
2519{
2520 long y,x,w,h;
2521 w=width/2; h=height/2;
2522#ifdef HAVE_MMX
2523 asm volatile(
2524 PREFETCH" %0 \n\t"
2525 PREFETCH" %1 \n\t"
2526 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2527#endif
2528 for (y=0;y<h;y++){
2529 const uint8_t* s1=src1+srcStride1*(y>>1);
2530 uint8_t* d=dst1+dstStride1*y;
2531 x=0;
2532#ifdef HAVE_MMX
2533 for (;x<w-31;x+=32)
2534 {
2535 asm volatile(
2536 PREFETCH" 32%1 \n\t"
2537 "movq %1, %%mm0 \n\t"
2538 "movq 8%1, %%mm2 \n\t"
2539 "movq 16%1, %%mm4 \n\t"
2540 "movq 24%1, %%mm6 \n\t"
2541 "movq %%mm0, %%mm1 \n\t"
2542 "movq %%mm2, %%mm3 \n\t"
2543 "movq %%mm4, %%mm5 \n\t"
2544 "movq %%mm6, %%mm7 \n\t"
2545 "punpcklbw %%mm0, %%mm0 \n\t"
2546 "punpckhbw %%mm1, %%mm1 \n\t"
2547 "punpcklbw %%mm2, %%mm2 \n\t"
2548 "punpckhbw %%mm3, %%mm3 \n\t"
2549 "punpcklbw %%mm4, %%mm4 \n\t"
2550 "punpckhbw %%mm5, %%mm5 \n\t"
2551 "punpcklbw %%mm6, %%mm6 \n\t"
2552 "punpckhbw %%mm7, %%mm7 \n\t"
2553 MOVNTQ" %%mm0, %0 \n\t"
2554 MOVNTQ" %%mm1, 8%0 \n\t"
2555 MOVNTQ" %%mm2, 16%0 \n\t"
2556 MOVNTQ" %%mm3, 24%0 \n\t"
2557 MOVNTQ" %%mm4, 32%0 \n\t"
2558 MOVNTQ" %%mm5, 40%0 \n\t"
2559 MOVNTQ" %%mm6, 48%0 \n\t"
2560 MOVNTQ" %%mm7, 56%0"
2561 :"=m"(d[2*x])
2562 :"m"(s1[x])
2563 :"memory");
2564 }
2565#endif
2566 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2567 }
2568 for (y=0;y<h;y++){
2569 const uint8_t* s2=src2+srcStride2*(y>>1);
2570 uint8_t* d=dst2+dstStride2*y;
2571 x=0;
2572#ifdef HAVE_MMX
2573 for (;x<w-31;x+=32)
2574 {
2575 asm volatile(
2576 PREFETCH" 32%1 \n\t"
2577 "movq %1, %%mm0 \n\t"
2578 "movq 8%1, %%mm2 \n\t"
2579 "movq 16%1, %%mm4 \n\t"
2580 "movq 24%1, %%mm6 \n\t"
2581 "movq %%mm0, %%mm1 \n\t"
2582 "movq %%mm2, %%mm3 \n\t"
2583 "movq %%mm4, %%mm5 \n\t"
2584 "movq %%mm6, %%mm7 \n\t"
2585 "punpcklbw %%mm0, %%mm0 \n\t"
2586 "punpckhbw %%mm1, %%mm1 \n\t"
2587 "punpcklbw %%mm2, %%mm2 \n\t"
2588 "punpckhbw %%mm3, %%mm3 \n\t"
2589 "punpcklbw %%mm4, %%mm4 \n\t"
2590 "punpckhbw %%mm5, %%mm5 \n\t"
2591 "punpcklbw %%mm6, %%mm6 \n\t"
2592 "punpckhbw %%mm7, %%mm7 \n\t"
2593 MOVNTQ" %%mm0, %0 \n\t"
2594 MOVNTQ" %%mm1, 8%0 \n\t"
2595 MOVNTQ" %%mm2, 16%0 \n\t"
2596 MOVNTQ" %%mm3, 24%0 \n\t"
2597 MOVNTQ" %%mm4, 32%0 \n\t"
2598 MOVNTQ" %%mm5, 40%0 \n\t"
2599 MOVNTQ" %%mm6, 48%0 \n\t"
2600 MOVNTQ" %%mm7, 56%0"
2601 :"=m"(d[2*x])
2602 :"m"(s2[x])
2603 :"memory");
2604 }
2605#endif
2606 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2607 }
2608#ifdef HAVE_MMX
2609 asm(
2610 EMMS" \n\t"
2611 SFENCE" \n\t"
2612 ::: "memory"
2613 );
2614#endif
2615}
2616
2617static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2618 uint8_t *dst,
2619 long width, long height,
2620 long srcStride1, long srcStride2,
2621 long srcStride3, long dstStride)
2622{
2623 long y,x,w,h;
2624 w=width/2; h=height;
2625 for (y=0;y<h;y++){
2626 const uint8_t* yp=src1+srcStride1*y;
2627 const uint8_t* up=src2+srcStride2*(y>>2);
2628 const uint8_t* vp=src3+srcStride3*(y>>2);
2629 uint8_t* d=dst+dstStride*y;
2630 x=0;
2631#ifdef HAVE_MMX
2632 for (;x<w-7;x+=8)
2633 {
2634 asm volatile(
2635 PREFETCH" 32(%1, %0) \n\t"
2636 PREFETCH" 32(%2, %0) \n\t"
2637 PREFETCH" 32(%3, %0) \n\t"
2638 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2639 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2640 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2641 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2642 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2643 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2644 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2645 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2646 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2647 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2648
2649 "movq %%mm1, %%mm6 \n\t"
2650 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2651 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2652 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2653 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2654 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2655
2656 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2657 "movq 8(%1, %0, 4), %%mm0 \n\t"
2658 "movq %%mm0, %%mm3 \n\t"
2659 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2660 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2661 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2662 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2663
2664 "movq %%mm4, %%mm6 \n\t"
2665 "movq 16(%1, %0, 4), %%mm0 \n\t"
2666 "movq %%mm0, %%mm3 \n\t"
2667 "punpcklbw %%mm5, %%mm4 \n\t"
2668 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2669 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2670 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2671 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2672
2673 "punpckhbw %%mm5, %%mm6 \n\t"
2674 "movq 24(%1, %0, 4), %%mm0 \n\t"
2675 "movq %%mm0, %%mm3 \n\t"
2676 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2677 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2678 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2679 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2680
2681 : "+r" (x)
2682 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2683 :"memory");
2684 }
2685#endif
2686 for (; x<w; x++)
2687 {
2688 const long x2 = x<<2;
2689 d[8*x+0] = yp[x2];
2690 d[8*x+1] = up[x];
2691 d[8*x+2] = yp[x2+1];
2692 d[8*x+3] = vp[x];
2693 d[8*x+4] = yp[x2+2];
2694 d[8*x+5] = up[x];
2695 d[8*x+6] = yp[x2+3];
2696 d[8*x+7] = vp[x];
2697 }
2698 }
2699#ifdef HAVE_MMX
2700 asm(
2701 EMMS" \n\t"
2702 SFENCE" \n\t"
2703 ::: "memory"
2704 );
2705#endif
2706}
2707
2708static inline void RENAME(rgb2rgb_init)(void){
2709 rgb15to16 = RENAME(rgb15to16);
2710 rgb15to24 = RENAME(rgb15to24);
2711 rgb15to32 = RENAME(rgb15to32);
2712 rgb16to24 = RENAME(rgb16to24);
2713 rgb16to32 = RENAME(rgb16to32);
2714 rgb16to15 = RENAME(rgb16to15);
2715 rgb24to16 = RENAME(rgb24to16);
2716 rgb24to15 = RENAME(rgb24to15);
2717 rgb24to32 = RENAME(rgb24to32);
2718 rgb32to16 = RENAME(rgb32to16);
2719 rgb32to15 = RENAME(rgb32to15);
2720 rgb32to24 = RENAME(rgb32to24);
2721 rgb24tobgr15 = RENAME(rgb24tobgr15);
2722 rgb24tobgr16 = RENAME(rgb24tobgr16);
2723 rgb24tobgr24 = RENAME(rgb24tobgr24);
2724 rgb32tobgr32 = RENAME(rgb32tobgr32);
2725 rgb32tobgr16 = RENAME(rgb32tobgr16);
2726 rgb32tobgr15 = RENAME(rgb32tobgr15);
2727 yv12toyuy2 = RENAME(yv12toyuy2);
2728 yv12touyvy = RENAME(yv12touyvy);
2729 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2730 yuy2toyv12 = RENAME(yuy2toyv12);
2731// uyvytoyv12 = RENAME(uyvytoyv12);
2732// yvu9toyv12 = RENAME(yvu9toyv12);
2733 planar2x = RENAME(planar2x);
2734 rgb24toyv12 = RENAME(rgb24toyv12);
2735 interleaveBytes = RENAME(interleaveBytes);
2736 vu9_to_vu12 = RENAME(vu9_to_vu12);
2737 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
2738}
diff --git a/src/plugins/ffmpeg/libswscale/swscale-example.c b/src/plugins/ffmpeg/libswscale/swscale-example.c
deleted file mode 100644
index bc2a8bf..0000000
--- a/src/plugins/ffmpeg/libswscale/swscale-example.c
+++ /dev/null
@@ -1,230 +0,0 @@
1/*
2 * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include <stdio.h>
22#include <stdlib.h>
23#include <string.h>
24#include <inttypes.h>
25#include <stdarg.h>
26
27#undef HAVE_AV_CONFIG_H
28#include "libavutil/avutil.h"
29#include "swscale.h"
30#include "swscale_internal.h"
31#include "rgb2rgb.h"
32
33static uint64_t getSSD(uint8_t *src1, uint8_t *src2, int stride1, int stride2, int w, int h){
34 int x,y;
35 uint64_t ssd=0;
36
37//printf("%d %d\n", w, h);
38
39 for (y=0; y<h; y++){
40 for (x=0; x<w; x++){
41 int d= src1[x + y*stride1] - src2[x + y*stride2];
42 ssd+= d*d;
43//printf("%d", abs(src1[x + y*stride1] - src2[x + y*stride2])/26 );
44 }
45//printf("\n");
46 }
47 return ssd;
48}
49
50// test by ref -> src -> dst -> out & compare out against ref
51// ref & out are YV12
52static int doTest(uint8_t *ref[3], int refStride[3], int w, int h, int srcFormat, int dstFormat,
53 int srcW, int srcH, int dstW, int dstH, int flags){
54 uint8_t *src[3];
55 uint8_t *dst[3];
56 uint8_t *out[3];
57 int srcStride[3], dstStride[3];
58 int i;
59 uint64_t ssdY, ssdU, ssdV;
60 struct SwsContext *srcContext, *dstContext, *outContext;
61 int res;
62
63 res = 0;
64 for (i=0; i<3; i++){
65 // avoid stride % bpp != 0
66 if (srcFormat==PIX_FMT_RGB24 || srcFormat==PIX_FMT_BGR24)
67 srcStride[i]= srcW*3;
68 else
69 srcStride[i]= srcW*4;
70
71 if (dstFormat==PIX_FMT_RGB24 || dstFormat==PIX_FMT_BGR24)
72 dstStride[i]= dstW*3;
73 else
74 dstStride[i]= dstW*4;
75
76 src[i]= (uint8_t*) malloc(srcStride[i]*srcH);
77 dst[i]= (uint8_t*) malloc(dstStride[i]*dstH);
78 out[i]= (uint8_t*) malloc(refStride[i]*h);
79 if (!src[i] || !dst[i] || !out[i]) {
80 perror("Malloc");
81 res = -1;
82
83 goto end;
84 }
85 }
86
87 dstContext = outContext = NULL;
88 srcContext= sws_getContext(w, h, PIX_FMT_YUV420P, srcW, srcH, srcFormat, flags, NULL, NULL, NULL);
89 if (!srcContext) {
90 fprintf(stderr, "Failed to get %s ---> %s\n",
91 sws_format_name(PIX_FMT_YUV420P),
92 sws_format_name(srcFormat));
93 res = -1;
94
95 goto end;
96 }
97 dstContext= sws_getContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, NULL, NULL, NULL);
98 if (!dstContext) {
99 fprintf(stderr, "Failed to get %s ---> %s\n",
100 sws_format_name(srcFormat),
101 sws_format_name(dstFormat));
102 res = -1;
103
104 goto end;
105 }
106 outContext= sws_getContext(dstW, dstH, dstFormat, w, h, PIX_FMT_YUV420P, flags, NULL, NULL, NULL);
107 if (!outContext) {
108 fprintf(stderr, "Failed to get %s ---> %s\n",
109 sws_format_name(dstFormat),
110 sws_format_name(PIX_FMT_YUV420P));
111 res = -1;
112
113 goto end;
114 }
115// printf("test %X %X %X -> %X %X %X\n", (int)ref[0], (int)ref[1], (int)ref[2],
116// (int)src[0], (int)src[1], (int)src[2]);
117
118 sws_scale(srcContext, ref, refStride, 0, h , src, srcStride);
119 sws_scale(dstContext, src, srcStride, 0, srcH, dst, dstStride);
120 sws_scale(outContext, dst, dstStride, 0, dstH, out, refStride);
121
122#if defined(ARCH_X86)
123 asm volatile ("emms\n\t");
124#endif
125
126 ssdY= getSSD(ref[0], out[0], refStride[0], refStride[0], w, h);
127 ssdU= getSSD(ref[1], out[1], refStride[1], refStride[1], (w+1)>>1, (h+1)>>1);
128 ssdV= getSSD(ref[2], out[2], refStride[2], refStride[2], (w+1)>>1, (h+1)>>1);
129
130 if (srcFormat == PIX_FMT_GRAY8 || dstFormat==PIX_FMT_GRAY8) ssdU=ssdV=0; //FIXME check that output is really gray
131
132 ssdY/= w*h;
133 ssdU/= w*h/4;
134 ssdV/= w*h/4;
135
136 if (ssdY>100 || ssdU>100 || ssdV>100){
137 printf(" %s %dx%d -> %s %4dx%4d flags=%2d SSD=%5lld,%5lld,%5lld\n",
138 sws_format_name(srcFormat), srcW, srcH,
139 sws_format_name(dstFormat), dstW, dstH,
140 flags,
141 ssdY, ssdU, ssdV);
142 }
143
144 end:
145
146 sws_freeContext(srcContext);
147 sws_freeContext(dstContext);
148 sws_freeContext(outContext);
149
150 for (i=0; i<3; i++){
151 free(src[i]);
152 free(dst[i]);
153 free(out[i]);
154 }
155
156 return res;
157}
158
159void fast_memcpy(void *a, void *b, int s){ //FIXME
160 memcpy(a, b, s);
161}
162
163static void selfTest(uint8_t *src[3], int stride[3], int w, int h){
164 enum PixelFormat srcFormat, dstFormat;
165 int srcW, srcH, dstW, dstH;
166 int flags;
167
168 for (srcFormat = 0; srcFormat < PIX_FMT_NB; srcFormat++) {
169 for (dstFormat = 0; dstFormat < PIX_FMT_NB; dstFormat++) {
170 printf("%s -> %s\n",
171 sws_format_name(srcFormat),
172 sws_format_name(dstFormat));
173
174 srcW= w;
175 srcH= h;
176 for (dstW=w - w/3; dstW<= 4*w/3; dstW+= w/3){
177 for (dstH=h - h/3; dstH<= 4*h/3; dstH+= h/3){
178 for (flags=1; flags<33; flags*=2) {
179 int res;
180
181 res = doTest(src, stride, w, h, srcFormat, dstFormat,
182 srcW, srcH, dstW, dstH, flags);
183 if (res < 0) {
184 dstW = 4 * w / 3;
185 dstH = 4 * h / 3;
186 flags = 33;
187 }
188 }
189 }
190 }
191 }
192 }
193}
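/* Note: the flags loop in selfTest (flags = 1, 2, 4, ..., 32) walks the six
   basic scaler algorithms one bit at a time: SWS_FAST_BILINEAR, SWS_BILINEAR,
   SWS_BICUBIC, SWS_X, SWS_POINT and SWS_AREA, as defined in swscale.h. */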
194
195#define W 96
196#define H 96
197
198int main(int argc, char **argv){
199 uint8_t *rgb_data = malloc (W*H*4);
200 uint8_t *rgb_src[3]= {rgb_data, NULL, NULL};
201 int rgb_stride[3]={4*W, 0, 0};
202 uint8_t *data = malloc (3*W*H);
203 uint8_t *src[3]= {data, data+W*H, data+W*H*2};
204 int stride[3]={W, W, W};
205 int x, y;
206 struct SwsContext *sws;
207
208 sws= sws_getContext(W/12, H/12, PIX_FMT_RGB32, W, H, PIX_FMT_YUV420P, 2, NULL, NULL, NULL);
209
210 for (y=0; y<H; y++){
211 for (x=0; x<W*4; x++){
212 rgb_data[ x + y*4*W]= random();
213 }
214 }
215#if defined(ARCH_X86)
216 sws_rgb2rgb_init(SWS_CPU_CAPS_MMX*0);
217#else
218 sws_rgb2rgb_init(0);
219#endif
220 sws_scale(sws, rgb_src, rgb_stride, 0, H, src, stride);
221
222#if defined(ARCH_X86)
223 asm volatile ("emms\n\t");
224#endif
225
226 selfTest(src, stride, W, H);
227 free (rgb_data);
228 free (data);
229 return 123;
230}
diff --git a/src/plugins/ffmpeg/libswscale/swscale.c b/src/plugins/ffmpeg/libswscale/swscale.c
deleted file mode 100644
index f6c2f76..0000000
--- a/src/plugins/ffmpeg/libswscale/swscale.c
+++ /dev/null
@@ -1,2934 +0,0 @@
1/*
2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * the C code (not assembly, mmx, ...) of this file can be used
21 * under the LGPL license too
22 */
23
24/*
25 supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR24, BGR16, BGR15, RGB32, RGB24, Y8/Y800, YVU9/IF09, PAL8
26 supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
27 {BGR,RGB}{1,4,8,15,16} support dithering
28
29 unscaled special converters (YV12=I420=IYUV, Y800=Y8)
30 YV12 -> {BGR,RGB}{1,4,8,15,16,24,32}
31 x -> x
32 YUV9 -> YV12
33 YUV9/YV12 -> Y800
34 Y800 -> YUV9/YV12
35 BGR24 -> BGR32 & RGB24 -> RGB32
36 BGR32 -> BGR24 & RGB32 -> RGB24
37 BGR15 -> BGR16
38*/
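/* A minimal sketch of how these converters are reached through the public
   API. The function names and signatures match the ones exercised by
   swscale-example.c below; the helper itself is illustrative and not part
   of the library: */
static void example_yv12_to_bgr24(uint8_t *src[3], int srcStride[3],
                                  uint8_t *dst[3], int dstStride[3],
                                  int w, int h)
{
    struct SwsContext *ctx = sws_getContext(w, h, PIX_FMT_YUV420P,
                                            w, h, PIX_FMT_BGR24,
                                            SWS_BILINEAR, NULL, NULL, NULL);
    if (ctx) {
        sws_scale(ctx, src, srcStride, 0, h, dst, dstStride); /* whole frame */
        sws_freeContext(ctx);
    }
}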
39
40/*
41tested special converters (most are actually tested, but I did not write them all down ...)
42 YV12 -> BGR16
43 YV12 -> YV12
44 BGR15 -> BGR16
45 BGR16 -> BGR16
46 YVU9 -> YV12
47
48untested special converters
49 YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be ok)
50 YV12/I420 -> YV12/I420
51 YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
52 BGR24 -> BGR32 & RGB24 -> RGB32
53 BGR32 -> BGR24 & RGB32 -> RGB24
54 BGR24 -> YV12
55*/
56
57#include <inttypes.h>
58#include <string.h>
59#include <math.h>
60#include <stdio.h>
61#include <unistd.h>
62#include "config.h"
63#include <assert.h>
64#ifdef HAVE_SYS_MMAN_H
65#include <sys/mman.h>
66#if defined(MAP_ANON) && !defined(MAP_ANONYMOUS)
67#define MAP_ANONYMOUS MAP_ANON
68#endif
69#endif
70#include "swscale.h"
71#include "swscale_internal.h"
72#include "rgb2rgb.h"
73#include "libavutil/x86_cpu.h"
74#include "libavutil/bswap.h"
75
76#undef MOVNTQ
77#undef PAVGB
78
79//#undef HAVE_MMX2
80//#define HAVE_3DNOW
81//#undef HAVE_MMX
82//#undef ARCH_X86
83//#define WORDS_BIGENDIAN
84#define DITHER1XBPP
85
86#define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit
87
88#define RET 0xC3 //near return opcode for X86
89
90#ifdef M_PI
91#define PI M_PI
92#else
93#define PI 3.14159265358979323846
94#endif
95
96#define isSupportedIn(x) ( \
97 (x)==PIX_FMT_YUV420P \
98 || (x)==PIX_FMT_YUVA420P \
99 || (x)==PIX_FMT_YUYV422 \
100 || (x)==PIX_FMT_UYVY422 \
101 || (x)==PIX_FMT_RGB32 \
102 || (x)==PIX_FMT_BGR24 \
103 || (x)==PIX_FMT_BGR565 \
104 || (x)==PIX_FMT_BGR555 \
105 || (x)==PIX_FMT_BGR32 \
106 || (x)==PIX_FMT_RGB24 \
107 || (x)==PIX_FMT_RGB565 \
108 || (x)==PIX_FMT_RGB555 \
109 || (x)==PIX_FMT_GRAY8 \
110 || (x)==PIX_FMT_YUV410P \
111 || (x)==PIX_FMT_GRAY16BE \
112 || (x)==PIX_FMT_GRAY16LE \
113 || (x)==PIX_FMT_YUV444P \
114 || (x)==PIX_FMT_YUV422P \
115 || (x)==PIX_FMT_YUV411P \
116 || (x)==PIX_FMT_PAL8 \
117 || (x)==PIX_FMT_BGR8 \
118 || (x)==PIX_FMT_RGB8 \
119 || (x)==PIX_FMT_BGR4_BYTE \
120 || (x)==PIX_FMT_RGB4_BYTE \
121 || (x)==PIX_FMT_YUV440P \
122 )
123#define isSupportedOut(x) ( \
124 (x)==PIX_FMT_YUV420P \
125 || (x)==PIX_FMT_YUYV422 \
126 || (x)==PIX_FMT_UYVY422 \
127 || (x)==PIX_FMT_YUV444P \
128 || (x)==PIX_FMT_YUV422P \
129 || (x)==PIX_FMT_YUV411P \
130 || isRGB(x) \
131 || isBGR(x) \
132 || (x)==PIX_FMT_NV12 \
133 || (x)==PIX_FMT_NV21 \
134 || (x)==PIX_FMT_GRAY16BE \
135 || (x)==PIX_FMT_GRAY16LE \
136 || (x)==PIX_FMT_GRAY8 \
137 || (x)==PIX_FMT_YUV410P \
138 )
139#define isPacked(x) ( \
140 (x)==PIX_FMT_PAL8 \
141 || (x)==PIX_FMT_YUYV422 \
142 || (x)==PIX_FMT_UYVY422 \
143 || isRGB(x) \
144 || isBGR(x) \
145 )
146
147#define RGB2YUV_SHIFT 16
148#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
149#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
150#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
151#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
152#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
153#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
154#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
155#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
156#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
157
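/* The coefficients above are ITU-R BT.601 limited-range RGB->YUV weights
   scaled by 1<<RGB2YUV_SHIFT. Worked example for full-scale white
   (r = g = b = 255):
       Y = ((RY*255 + GY*255 + BY*255) >> RGB2YUV_SHIFT) + 16
         ~ (0.257 + 0.504 + 0.098)*255 + 16 = 219 + 16 = 235
   so white lands at the top of the 16..235 video range. An illustrative
   scalar helper (not part of the library) mirroring the luma expression
   used by the C converters: */
static inline int example_rgb_to_luma(int r, int g, int b)
{
    return ((RY*r + GY*g + BY*b) >> RGB2YUV_SHIFT) + 16;
}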
158extern const int32_t Inverse_Table_6_9[8][4];
159
160/*
161NOTES
162Special versions: fast Y 1:1 scaling (no interpolation in y direction)
163
164TODO
165more intelligent misalignment avoidance for the horizontal scaler
166write special vertical cubic upscale version
167Optimize C code (yv12 / minmax)
168add support for packed pixel yuv input & output
169add support for Y8 output
170optimize bgr24 & bgr32
171add BGR4 output support
172write special BGR->BGR scaler
173*/
174
175#if defined(ARCH_X86) && defined (CONFIG_GPL)
176DECLARE_ASM_CONST(8, uint64_t, bF8)= 0xF8F8F8F8F8F8F8F8LL;
177DECLARE_ASM_CONST(8, uint64_t, bFC)= 0xFCFCFCFCFCFCFCFCLL;
178DECLARE_ASM_CONST(8, uint64_t, w10)= 0x0010001000100010LL;
179DECLARE_ASM_CONST(8, uint64_t, w02)= 0x0002000200020002LL;
180DECLARE_ASM_CONST(8, uint64_t, bm00001111)=0x00000000FFFFFFFFLL;
181DECLARE_ASM_CONST(8, uint64_t, bm00000111)=0x0000000000FFFFFFLL;
182DECLARE_ASM_CONST(8, uint64_t, bm11111000)=0xFFFFFFFFFF000000LL;
183DECLARE_ASM_CONST(8, uint64_t, bm01010101)=0x00FF00FF00FF00FFLL;
184
185static volatile uint64_t attribute_used __attribute__((aligned(8))) b5Dither;
186static volatile uint64_t attribute_used __attribute__((aligned(8))) g5Dither;
187static volatile uint64_t attribute_used __attribute__((aligned(8))) g6Dither;
188static volatile uint64_t attribute_used __attribute__((aligned(8))) r5Dither;
189
190const DECLARE_ALIGNED(8, uint64_t, ff_dither4[2]) = {
191 0x0103010301030103LL,
192 0x0200020002000200LL,};
193
194const DECLARE_ALIGNED(8, uint64_t, ff_dither8[2]) = {
195 0x0602060206020602LL,
196 0x0004000400040004LL,};
197
198DECLARE_ASM_CONST(8, uint64_t, b16Mask)= 0x001F001F001F001FLL;
199DECLARE_ASM_CONST(8, uint64_t, g16Mask)= 0x07E007E007E007E0LL;
200DECLARE_ASM_CONST(8, uint64_t, r16Mask)= 0xF800F800F800F800LL;
201DECLARE_ASM_CONST(8, uint64_t, b15Mask)= 0x001F001F001F001FLL;
202DECLARE_ASM_CONST(8, uint64_t, g15Mask)= 0x03E003E003E003E0LL;
203DECLARE_ASM_CONST(8, uint64_t, r15Mask)= 0x7C007C007C007C00LL;
204
205DECLARE_ALIGNED(8, const uint64_t, ff_M24A) = 0x00FF0000FF0000FFLL;
206DECLARE_ALIGNED(8, const uint64_t, ff_M24B) = 0xFF0000FF0000FF00LL;
207DECLARE_ALIGNED(8, const uint64_t, ff_M24C) = 0x0000FF0000FF0000LL;
208
209#ifdef FAST_BGR2YV12
210DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff) = 0x000000210041000DULL;
211DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff) = 0x0000FFEEFFDC0038ULL;
212DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00000038FFD2FFF8ULL;
213#else
214DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff) = 0x000020E540830C8BULL;
215DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff) = 0x0000ED0FDAC23831ULL;
216DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00003831D0E6F6EAULL;
217#endif /* FAST_BGR2YV12 */
218DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset) = 0x1010101010101010ULL;
219DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
220DECLARE_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL;
221#endif /* defined(ARCH_X86) */
222
223// clipping helper table for C implementations:
224static unsigned char clip_table[768];
225
226static SwsVector *sws_getConvVec(SwsVector *a, SwsVector *b);
227
228extern const uint8_t dither_2x2_4[2][8];
229extern const uint8_t dither_2x2_8[2][8];
230extern const uint8_t dither_8x8_32[8][8];
231extern const uint8_t dither_8x8_73[8][8];
232extern const uint8_t dither_8x8_220[8][8];
233
234const char *sws_format_name(enum PixelFormat format)
235{
236 switch (format) {
237 case PIX_FMT_YUV420P:
238 return "yuv420p";
239 case PIX_FMT_YUVA420P:
240 return "yuva420p";
241 case PIX_FMT_YUYV422:
242 return "yuyv422";
243 case PIX_FMT_RGB24:
244 return "rgb24";
245 case PIX_FMT_BGR24:
246 return "bgr24";
247 case PIX_FMT_YUV422P:
248 return "yuv422p";
249 case PIX_FMT_YUV444P:
250 return "yuv444p";
251 case PIX_FMT_RGB32:
252 return "rgb32";
253 case PIX_FMT_YUV410P:
254 return "yuv410p";
255 case PIX_FMT_YUV411P:
256 return "yuv411p";
257 case PIX_FMT_RGB565:
258 return "rgb565";
259 case PIX_FMT_RGB555:
260 return "rgb555";
261 case PIX_FMT_GRAY16BE:
262 return "gray16be";
263 case PIX_FMT_GRAY16LE:
264 return "gray16le";
265 case PIX_FMT_GRAY8:
266 return "gray8";
267 case PIX_FMT_MONOWHITE:
268 return "mono white";
269 case PIX_FMT_MONOBLACK:
270 return "mono black";
271 case PIX_FMT_PAL8:
272 return "Palette";
273 case PIX_FMT_YUVJ420P:
274 return "yuvj420p";
275 case PIX_FMT_YUVJ422P:
276 return "yuvj422p";
277 case PIX_FMT_YUVJ444P:
278 return "yuvj444p";
279 case PIX_FMT_XVMC_MPEG2_MC:
280 return "xvmc_mpeg2_mc";
281 case PIX_FMT_XVMC_MPEG2_IDCT:
282 return "xvmc_mpeg2_idct";
283 case PIX_FMT_UYVY422:
284 return "uyvy422";
285 case PIX_FMT_UYYVYY411:
286 return "uyyvyy411";
287 case PIX_FMT_RGB32_1:
288 return "rgb32x";
289 case PIX_FMT_BGR32_1:
290 return "bgr32x";
291 case PIX_FMT_BGR32:
292 return "bgr32";
293 case PIX_FMT_BGR565:
294 return "bgr565";
295 case PIX_FMT_BGR555:
296 return "bgr555";
297 case PIX_FMT_BGR8:
298 return "bgr8";
299 case PIX_FMT_BGR4:
300 return "bgr4";
301 case PIX_FMT_BGR4_BYTE:
302 return "bgr4 byte";
303 case PIX_FMT_RGB8:
304 return "rgb8";
305 case PIX_FMT_RGB4:
306 return "rgb4";
307 case PIX_FMT_RGB4_BYTE:
308 return "rgb4 byte";
309 case PIX_FMT_NV12:
310 return "nv12";
311 case PIX_FMT_NV21:
312 return "nv21";
313 case PIX_FMT_YUV440P:
314 return "yuv440p";
315 default:
316 return "Unknown format";
317 }
318}
319
320static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
321 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
322 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
323{
324    //FIXME Optimize (just quickly written, not optimized)
325 int i;
326 for (i=0; i<dstW; i++)
327 {
328 int val=1<<18;
329 int j;
330 for (j=0; j<lumFilterSize; j++)
331 val += lumSrc[j][i] * lumFilter[j];
332
333 dest[i]= av_clip_uint8(val>>19);
334 }
335
336 if (uDest)
337 for (i=0; i<chrDstW; i++)
338 {
339 int u=1<<18;
340 int v=1<<18;
341 int j;
342 for (j=0; j<chrFilterSize; j++)
343 {
344 u += chrSrc[j][i] * chrFilter[j];
345 v += chrSrc[j][i + VOFW] * chrFilter[j];
346 }
347
348 uDest[i]= av_clip_uint8(u>>19);
349 vDest[i]= av_clip_uint8(v>>19);
350 }
351}
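/* Note: the 1<<18 bias and the >>19 shift implement round-to-nearest,
   assuming 15-bit intermediate samples and vertical filter coefficients
   normalized to sum to 1<<12 (15 + 12 - 19 = 8 output bits). */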
352
353static inline void yuv2nv12XinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
354 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
355 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
356{
357    //FIXME Optimize (just quickly written, not optimized)
358 int i;
359 for (i=0; i<dstW; i++)
360 {
361 int val=1<<18;
362 int j;
363 for (j=0; j<lumFilterSize; j++)
364 val += lumSrc[j][i] * lumFilter[j];
365
366 dest[i]= av_clip_uint8(val>>19);
367 }
368
369 if (!uDest)
370 return;
371
372 if (dstFormat == PIX_FMT_NV12)
373 for (i=0; i<chrDstW; i++)
374 {
375 int u=1<<18;
376 int v=1<<18;
377 int j;
378 for (j=0; j<chrFilterSize; j++)
379 {
380 u += chrSrc[j][i] * chrFilter[j];
381 v += chrSrc[j][i + VOFW] * chrFilter[j];
382 }
383
384 uDest[2*i]= av_clip_uint8(u>>19);
385 uDest[2*i+1]= av_clip_uint8(v>>19);
386 }
387 else
388 for (i=0; i<chrDstW; i++)
389 {
390 int u=1<<18;
391 int v=1<<18;
392 int j;
393 for (j=0; j<chrFilterSize; j++)
394 {
395 u += chrSrc[j][i] * chrFilter[j];
396 v += chrSrc[j][i + VOFW] * chrFilter[j];
397 }
398
399 uDest[2*i]= av_clip_uint8(v>>19);
400 uDest[2*i+1]= av_clip_uint8(u>>19);
401 }
402}
403
404#define YSCALE_YUV_2_PACKEDX_C(type) \
405 for (i=0; i<(dstW>>1); i++){\
406 int j;\
407 int Y1 = 1<<18;\
408 int Y2 = 1<<18;\
409 int U = 1<<18;\
410 int V = 1<<18;\
411 type av_unused *r, *b, *g;\
412 const int i2= 2*i;\
413 \
414 for (j=0; j<lumFilterSize; j++)\
415 {\
416 Y1 += lumSrc[j][i2] * lumFilter[j];\
417 Y2 += lumSrc[j][i2+1] * lumFilter[j];\
418 }\
419 for (j=0; j<chrFilterSize; j++)\
420 {\
421 U += chrSrc[j][i] * chrFilter[j];\
422 V += chrSrc[j][i+VOFW] * chrFilter[j];\
423 }\
424 Y1>>=19;\
425 Y2>>=19;\
426 U >>=19;\
427 V >>=19;\
428 if ((Y1|Y2|U|V)&256)\
429 {\
430 if (Y1>255) Y1=255; \
431 else if (Y1<0)Y1=0; \
432 if (Y2>255) Y2=255; \
433 else if (Y2<0)Y2=0; \
434 if (U>255) U=255; \
435 else if (U<0) U=0; \
436 if (V>255) V=255; \
437 else if (V<0) V=0; \
438 }
439
440#define YSCALE_YUV_2_RGBX_C(type) \
441 YSCALE_YUV_2_PACKEDX_C(type) \
442 r = (type *)c->table_rV[V]; \
443 g = (type *)(c->table_gU[U] + c->table_gV[V]); \
444 b = (type *)c->table_bU[U]; \
445
446#define YSCALE_YUV_2_PACKED2_C \
447 for (i=0; i<(dstW>>1); i++){ \
448 const int i2= 2*i; \
449 int Y1= (buf0[i2 ]*yalpha1+buf1[i2 ]*yalpha)>>19; \
450 int Y2= (buf0[i2+1]*yalpha1+buf1[i2+1]*yalpha)>>19; \
451 int U= (uvbuf0[i ]*uvalpha1+uvbuf1[i ]*uvalpha)>>19; \
452 int V= (uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19; \
453
454#define YSCALE_YUV_2_RGB2_C(type) \
455 YSCALE_YUV_2_PACKED2_C\
456 type *r, *b, *g;\
457 r = (type *)c->table_rV[V];\
458 g = (type *)(c->table_gU[U] + c->table_gV[V]);\
459 b = (type *)c->table_bU[U];\
460
461#define YSCALE_YUV_2_PACKED1_C \
462 for (i=0; i<(dstW>>1); i++){\
463 const int i2= 2*i;\
464 int Y1= buf0[i2 ]>>7;\
465 int Y2= buf0[i2+1]>>7;\
466 int U= (uvbuf1[i ])>>7;\
467 int V= (uvbuf1[i+VOFW])>>7;\
468
469#define YSCALE_YUV_2_RGB1_C(type) \
470 YSCALE_YUV_2_PACKED1_C\
471 type *r, *b, *g;\
472 r = (type *)c->table_rV[V];\
473 g = (type *)(c->table_gU[U] + c->table_gV[V]);\
474 b = (type *)c->table_bU[U];\
475
476#define YSCALE_YUV_2_PACKED1B_C \
477 for (i=0; i<(dstW>>1); i++){\
478 const int i2= 2*i;\
479 int Y1= buf0[i2 ]>>7;\
480 int Y2= buf0[i2+1]>>7;\
481 int U= (uvbuf0[i ] + uvbuf1[i ])>>8;\
482 int V= (uvbuf0[i+VOFW] + uvbuf1[i+VOFW])>>8;\
483
484#define YSCALE_YUV_2_RGB1B_C(type) \
485 YSCALE_YUV_2_PACKED1B_C\
486 type *r, *b, *g;\
487 r = (type *)c->table_rV[V];\
488 g = (type *)(c->table_gU[U] + c->table_gV[V]);\
489 b = (type *)c->table_bU[U];\
490
491#define YSCALE_YUV_2_ANYRGB_C(func, func2)\
492 switch(c->dstFormat)\
493 {\
494 case PIX_FMT_RGB32:\
495 case PIX_FMT_BGR32:\
496 func(uint32_t)\
497 ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];\
498 ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];\
499 } \
500 break;\
501 case PIX_FMT_RGB24:\
502 func(uint8_t)\
503 ((uint8_t*)dest)[0]= r[Y1];\
504 ((uint8_t*)dest)[1]= g[Y1];\
505 ((uint8_t*)dest)[2]= b[Y1];\
506 ((uint8_t*)dest)[3]= r[Y2];\
507 ((uint8_t*)dest)[4]= g[Y2];\
508 ((uint8_t*)dest)[5]= b[Y2];\
509 dest+=6;\
510 }\
511 break;\
512 case PIX_FMT_BGR24:\
513 func(uint8_t)\
514 ((uint8_t*)dest)[0]= b[Y1];\
515 ((uint8_t*)dest)[1]= g[Y1];\
516 ((uint8_t*)dest)[2]= r[Y1];\
517 ((uint8_t*)dest)[3]= b[Y2];\
518 ((uint8_t*)dest)[4]= g[Y2];\
519 ((uint8_t*)dest)[5]= r[Y2];\
520 dest+=6;\
521 }\
522 break;\
523 case PIX_FMT_RGB565:\
524 case PIX_FMT_BGR565:\
525 {\
526 const int dr1= dither_2x2_8[y&1 ][0];\
527 const int dg1= dither_2x2_4[y&1 ][0];\
528 const int db1= dither_2x2_8[(y&1)^1][0];\
529 const int dr2= dither_2x2_8[y&1 ][1];\
530 const int dg2= dither_2x2_4[y&1 ][1];\
531 const int db2= dither_2x2_8[(y&1)^1][1];\
532 func(uint16_t)\
533 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
534 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
535 }\
536 }\
537 break;\
538 case PIX_FMT_RGB555:\
539 case PIX_FMT_BGR555:\
540 {\
541 const int dr1= dither_2x2_8[y&1 ][0];\
542 const int dg1= dither_2x2_8[y&1 ][1];\
543 const int db1= dither_2x2_8[(y&1)^1][0];\
544 const int dr2= dither_2x2_8[y&1 ][1];\
545 const int dg2= dither_2x2_8[y&1 ][0];\
546 const int db2= dither_2x2_8[(y&1)^1][1];\
547 func(uint16_t)\
548 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
549 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
550 }\
551 }\
552 break;\
553 case PIX_FMT_RGB8:\
554 case PIX_FMT_BGR8:\
555 {\
556 const uint8_t * const d64= dither_8x8_73[y&7];\
557 const uint8_t * const d32= dither_8x8_32[y&7];\
558 func(uint8_t)\
559 ((uint8_t*)dest)[i2+0]= r[Y1+d32[(i2+0)&7]] + g[Y1+d32[(i2+0)&7]] + b[Y1+d64[(i2+0)&7]];\
560 ((uint8_t*)dest)[i2+1]= r[Y2+d32[(i2+1)&7]] + g[Y2+d32[(i2+1)&7]] + b[Y2+d64[(i2+1)&7]];\
561 }\
562 }\
563 break;\
564 case PIX_FMT_RGB4:\
565 case PIX_FMT_BGR4:\
566 {\
567 const uint8_t * const d64= dither_8x8_73 [y&7];\
568 const uint8_t * const d128=dither_8x8_220[y&7];\
569 func(uint8_t)\
570 ((uint8_t*)dest)[i]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]]\
571 + ((r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]])<<4);\
572 }\
573 }\
574 break;\
575 case PIX_FMT_RGB4_BYTE:\
576 case PIX_FMT_BGR4_BYTE:\
577 {\
578 const uint8_t * const d64= dither_8x8_73 [y&7];\
579 const uint8_t * const d128=dither_8x8_220[y&7];\
580 func(uint8_t)\
581 ((uint8_t*)dest)[i2+0]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]];\
582 ((uint8_t*)dest)[i2+1]= r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]];\
583 }\
584 }\
585 break;\
586 case PIX_FMT_MONOBLACK:\
587 {\
588 const uint8_t * const d128=dither_8x8_220[y&7];\
589 uint8_t *g= c->table_gU[128] + c->table_gV[128];\
590 for (i=0; i<dstW-7; i+=8){\
591 int acc;\
592 acc = g[((buf0[i ]*yalpha1+buf1[i ]*yalpha)>>19) + d128[0]];\
593 acc+= acc + g[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19) + d128[1]];\
594 acc+= acc + g[((buf0[i+2]*yalpha1+buf1[i+2]*yalpha)>>19) + d128[2]];\
595 acc+= acc + g[((buf0[i+3]*yalpha1+buf1[i+3]*yalpha)>>19) + d128[3]];\
596 acc+= acc + g[((buf0[i+4]*yalpha1+buf1[i+4]*yalpha)>>19) + d128[4]];\
597 acc+= acc + g[((buf0[i+5]*yalpha1+buf1[i+5]*yalpha)>>19) + d128[5]];\
598 acc+= acc + g[((buf0[i+6]*yalpha1+buf1[i+6]*yalpha)>>19) + d128[6]];\
599 acc+= acc + g[((buf0[i+7]*yalpha1+buf1[i+7]*yalpha)>>19) + d128[7]];\
600 ((uint8_t*)dest)[0]= acc;\
601 dest++;\
602 }\
603\
604/*\
605((uint8_t*)dest)-= dstW>>4;\
606{\
607 int acc=0;\
608 int left=0;\
609 static int top[1024];\
610 static int last_new[1024][1024];\
611 static int last_in3[1024][1024];\
612 static int drift[1024][1024];\
613 int topLeft=0;\
614 int shift=0;\
615 int count=0;\
616 const uint8_t * const d128=dither_8x8_220[y&7];\
617 int error_new=0;\
618 int error_in3=0;\
619 int f=0;\
620 \
621 for (i=dstW>>1; i<dstW; i++){\
622 int in= ((buf0[i ]*yalpha1+buf1[i ]*yalpha)>>19);\
623 int in2 = (76309 * (in - 16) + 32768) >> 16;\
624 int in3 = (in2 < 0) ? 0 : ((in2 > 255) ? 255 : in2);\
625 int old= (left*7 + topLeft + top[i]*5 + top[i+1]*3)/20 + in3\
626 + (last_new[y][i] - in3)*f/256;\
627 int new= old> 128 ? 255 : 0;\
628\
629 error_new+= FFABS(last_new[y][i] - new);\
630 error_in3+= FFABS(last_in3[y][i] - in3);\
631 f= error_new - error_in3*4;\
632 if (f<0) f=0;\
633 if (f>256) f=256;\
634\
635 topLeft= top[i];\
636 left= top[i]= old - new;\
637 last_new[y][i]= new;\
638 last_in3[y][i]= in3;\
639\
640 acc+= acc + (new&1);\
641 if ((i&7)==6){\
642 ((uint8_t*)dest)[0]= acc;\
643 ((uint8_t*)dest)++;\
644 }\
645 }\
646}\
647*/\
648 }\
649 break;\
650 case PIX_FMT_YUYV422:\
651 func2\
652 ((uint8_t*)dest)[2*i2+0]= Y1;\
653 ((uint8_t*)dest)[2*i2+1]= U;\
654 ((uint8_t*)dest)[2*i2+2]= Y2;\
655 ((uint8_t*)dest)[2*i2+3]= V;\
656 } \
657 break;\
658 case PIX_FMT_UYVY422:\
659 func2\
660 ((uint8_t*)dest)[2*i2+0]= U;\
661 ((uint8_t*)dest)[2*i2+1]= Y1;\
662 ((uint8_t*)dest)[2*i2+2]= V;\
663 ((uint8_t*)dest)[2*i2+3]= Y2;\
664 } \
665 break;\
666 }\
667
668
669static inline void yuv2packedXinC(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
670 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
671 uint8_t *dest, int dstW, int y)
672{
673 int i;
674 switch(c->dstFormat)
675 {
676 case PIX_FMT_BGR32:
677 case PIX_FMT_RGB32:
678 YSCALE_YUV_2_RGBX_C(uint32_t)
679 ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];
680 ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];
681 }
682 break;
683 case PIX_FMT_RGB24:
684 YSCALE_YUV_2_RGBX_C(uint8_t)
685 ((uint8_t*)dest)[0]= r[Y1];
686 ((uint8_t*)dest)[1]= g[Y1];
687 ((uint8_t*)dest)[2]= b[Y1];
688 ((uint8_t*)dest)[3]= r[Y2];
689 ((uint8_t*)dest)[4]= g[Y2];
690 ((uint8_t*)dest)[5]= b[Y2];
691 dest+=6;
692 }
693 break;
694 case PIX_FMT_BGR24:
695 YSCALE_YUV_2_RGBX_C(uint8_t)
696 ((uint8_t*)dest)[0]= b[Y1];
697 ((uint8_t*)dest)[1]= g[Y1];
698 ((uint8_t*)dest)[2]= r[Y1];
699 ((uint8_t*)dest)[3]= b[Y2];
700 ((uint8_t*)dest)[4]= g[Y2];
701 ((uint8_t*)dest)[5]= r[Y2];
702 dest+=6;
703 }
704 break;
705 case PIX_FMT_RGB565:
706 case PIX_FMT_BGR565:
707 {
708 const int dr1= dither_2x2_8[y&1 ][0];
709 const int dg1= dither_2x2_4[y&1 ][0];
710 const int db1= dither_2x2_8[(y&1)^1][0];
711 const int dr2= dither_2x2_8[y&1 ][1];
712 const int dg2= dither_2x2_4[y&1 ][1];
713 const int db2= dither_2x2_8[(y&1)^1][1];
714 YSCALE_YUV_2_RGBX_C(uint16_t)
715 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];
716 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];
717 }
718 }
719 break;
720 case PIX_FMT_RGB555:
721 case PIX_FMT_BGR555:
722 {
723 const int dr1= dither_2x2_8[y&1 ][0];
724 const int dg1= dither_2x2_8[y&1 ][1];
725 const int db1= dither_2x2_8[(y&1)^1][0];
726 const int dr2= dither_2x2_8[y&1 ][1];
727 const int dg2= dither_2x2_8[y&1 ][0];
728 const int db2= dither_2x2_8[(y&1)^1][1];
729 YSCALE_YUV_2_RGBX_C(uint16_t)
730 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];
731 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];
732 }
733 }
734 break;
735 case PIX_FMT_RGB8:
736 case PIX_FMT_BGR8:
737 {
738 const uint8_t * const d64= dither_8x8_73[y&7];
739 const uint8_t * const d32= dither_8x8_32[y&7];
740 YSCALE_YUV_2_RGBX_C(uint8_t)
741 ((uint8_t*)dest)[i2+0]= r[Y1+d32[(i2+0)&7]] + g[Y1+d32[(i2+0)&7]] + b[Y1+d64[(i2+0)&7]];
742 ((uint8_t*)dest)[i2+1]= r[Y2+d32[(i2+1)&7]] + g[Y2+d32[(i2+1)&7]] + b[Y2+d64[(i2+1)&7]];
743 }
744 }
745 break;
746 case PIX_FMT_RGB4:
747 case PIX_FMT_BGR4:
748 {
749 const uint8_t * const d64= dither_8x8_73 [y&7];
750 const uint8_t * const d128=dither_8x8_220[y&7];
751 YSCALE_YUV_2_RGBX_C(uint8_t)
752 ((uint8_t*)dest)[i]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]]
753 +((r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]])<<4);
754 }
755 }
756 break;
757 case PIX_FMT_RGB4_BYTE:
758 case PIX_FMT_BGR4_BYTE:
759 {
760 const uint8_t * const d64= dither_8x8_73 [y&7];
761 const uint8_t * const d128=dither_8x8_220[y&7];
762 YSCALE_YUV_2_RGBX_C(uint8_t)
763 ((uint8_t*)dest)[i2+0]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]];
764 ((uint8_t*)dest)[i2+1]= r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]];
765 }
766 }
767 break;
768 case PIX_FMT_MONOBLACK:
769 {
770 const uint8_t * const d128=dither_8x8_220[y&7];
771 uint8_t *g= c->table_gU[128] + c->table_gV[128];
772 int acc=0;
773 for (i=0; i<dstW-1; i+=2){
774 int j;
775 int Y1=1<<18;
776 int Y2=1<<18;
777
778 for (j=0; j<lumFilterSize; j++)
779 {
780 Y1 += lumSrc[j][i] * lumFilter[j];
781 Y2 += lumSrc[j][i+1] * lumFilter[j];
782 }
783 Y1>>=19;
784 Y2>>=19;
785 if ((Y1|Y2)&256)
786 {
787 if (Y1>255) Y1=255;
788 else if (Y1<0)Y1=0;
789 if (Y2>255) Y2=255;
790 else if (Y2<0)Y2=0;
791 }
792 acc+= acc + g[Y1+d128[(i+0)&7]];
793 acc+= acc + g[Y2+d128[(i+1)&7]];
794 if ((i&7)==6){
795 ((uint8_t*)dest)[0]= acc;
796 dest++;
797 }
798 }
799 }
800 break;
801 case PIX_FMT_YUYV422:
802 YSCALE_YUV_2_PACKEDX_C(void)
803 ((uint8_t*)dest)[2*i2+0]= Y1;
804 ((uint8_t*)dest)[2*i2+1]= U;
805 ((uint8_t*)dest)[2*i2+2]= Y2;
806 ((uint8_t*)dest)[2*i2+3]= V;
807 }
808 break;
809 case PIX_FMT_UYVY422:
810 YSCALE_YUV_2_PACKEDX_C(void)
811 ((uint8_t*)dest)[2*i2+0]= U;
812 ((uint8_t*)dest)[2*i2+1]= Y1;
813 ((uint8_t*)dest)[2*i2+2]= V;
814 ((uint8_t*)dest)[2*i2+3]= Y2;
815 }
816 break;
817 }
818}
819
820
821 //Note: we have C, X86, MMX, MMX2 and 3DNOW versions; there is no 3DNOW+MMX2 one
822//Plain C versions
823#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT) || !defined(CONFIG_GPL)
824#define COMPILE_C
825#endif
826
827#ifdef ARCH_POWERPC
828#if (defined (HAVE_ALTIVEC) || defined (RUNTIME_CPUDETECT)) && defined (CONFIG_GPL)
829#define COMPILE_ALTIVEC
830#endif //HAVE_ALTIVEC
831#endif //ARCH_POWERPC
832
833#if defined(ARCH_X86)
834
835#if ((defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)) && defined (CONFIG_GPL)
836#define COMPILE_MMX
837#endif
838
839#if (defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)) && defined (CONFIG_GPL)
840#define COMPILE_MMX2
841#endif
842
843#if ((defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)) && defined (CONFIG_GPL)
844#define COMPILE_3DNOW
845#endif
846#endif //ARCH_X86 || ARCH_X86_64
847
848#undef HAVE_MMX
849#undef HAVE_MMX2
850#undef HAVE_3DNOW
851
852#ifdef COMPILE_C
853#undef HAVE_MMX
854#undef HAVE_MMX2
855#undef HAVE_3DNOW
856#undef HAVE_ALTIVEC
857#define RENAME(a) a ## _C
858#include "swscale_template.c"
859#endif
860
861#ifdef COMPILE_ALTIVEC
862#undef RENAME
863#define HAVE_ALTIVEC
864#define RENAME(a) a ## _altivec
865#include "swscale_template.c"
866#endif
867
868#if defined(ARCH_X86)
869
870//X86 versions
871/*
872#undef RENAME
873#undef HAVE_MMX
874#undef HAVE_MMX2
875#undef HAVE_3DNOW
876#define ARCH_X86
877#define RENAME(a) a ## _X86
878#include "swscale_template.c"
879*/
880//MMX versions
881#ifdef COMPILE_MMX
882#undef RENAME
883#define HAVE_MMX
884#undef HAVE_MMX2
885#undef HAVE_3DNOW
886#define RENAME(a) a ## _MMX
887#include "swscale_template.c"
888#endif
889
890//MMX2 versions
891#ifdef COMPILE_MMX2
892#undef RENAME
893#define HAVE_MMX
894#define HAVE_MMX2
895#undef HAVE_3DNOW
896#define RENAME(a) a ## _MMX2
897#include "swscale_template.c"
898#endif
899
900//3DNOW versions
901#ifdef COMPILE_3DNOW
902#undef RENAME
903#define HAVE_MMX
904#undef HAVE_MMX2
905#define HAVE_3DNOW
906#define RENAME(a) a ## _3DNow
907#include "swscale_template.c"
908#endif
909
910#endif //ARCH_X86 || ARCH_X86_64
911
912 // minor note: the HAVE_xyz macros are messed up after this line, so do not use them below
913
914static double getSplineCoeff(double a, double b, double c, double d, double dist)
915{
916// printf("%f %f %f %f %f\n", a,b,c,d,dist);
917 if (dist<=1.0) return ((d*dist + c)*dist + b)*dist +a;
918 else return getSplineCoeff( 0.0,
919 b+ 2.0*c + 3.0*d,
920 c + 3.0*d,
921 -b- 3.0*c - 6.0*d,
922 dist-1.0);
923}
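/* For dist <= 1.0 this is a Horner evaluation of the cubic
   a + b*dist + c*dist^2 + d*dist^3; for dist > 1.0 it recurses with the
   coefficients of the next spline segment, re-based at dist-1. An
   illustrative restatement of the in-range case: */
static inline double example_spline_segment(double a, double b, double c,
                                            double d, double t)
{
    return a + t*(b + t*(c + t*d)); /* == ((d*t + c)*t + b)*t + a */
}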
924
925static inline int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
926 int srcW, int dstW, int filterAlign, int one, int flags,
927 SwsVector *srcFilter, SwsVector *dstFilter, double param[2])
928{
929 int i;
930 int filterSize;
931 int filter2Size;
932 int minFilterSize;
933 double *filter=NULL;
934 double *filter2=NULL;
935 int ret= -1;
936#if defined(ARCH_X86)
937 if (flags & SWS_CPU_CAPS_MMX)
938 asm volatile("emms\n\t"::: "memory"); //FIXME this should not be required but it IS (even for non-MMX versions)
939#endif
940
941    // Note the +1 is for the MMX scaler, which reads over the end
942 *filterPos = av_malloc((dstW+1)*sizeof(int16_t));
943
944 if (FFABS(xInc - 0x10000) <10) // unscaled
945 {
946 int i;
947 filterSize= 1;
948 filter= av_malloc(dstW*sizeof(double)*filterSize);
949 for (i=0; i<dstW*filterSize; i++) filter[i]=0;
950
951 for (i=0; i<dstW; i++)
952 {
953 filter[i*filterSize]=1;
954 (*filterPos)[i]=i;
955 }
956
957 }
958 else if (flags&SWS_POINT) // lame looking point sampling mode
959 {
960 int i;
961 int xDstInSrc;
962 filterSize= 1;
963 filter= av_malloc(dstW*sizeof(double)*filterSize);
964
965 xDstInSrc= xInc/2 - 0x8000;
966 for (i=0; i<dstW; i++)
967 {
968 int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
969
970 (*filterPos)[i]= xx;
971 filter[i]= 1.0;
972 xDstInSrc+= xInc;
973 }
974 }
975 else if ((xInc <= (1<<16) && (flags&SWS_AREA)) || (flags&SWS_FAST_BILINEAR)) // bilinear upscale
976 {
977 int i;
978 int xDstInSrc;
979 if (flags&SWS_BICUBIC) filterSize= 4;
980 else if (flags&SWS_X ) filterSize= 4;
981 else filterSize= 2; // SWS_BILINEAR / SWS_AREA
982 filter= av_malloc(dstW*sizeof(double)*filterSize);
983
984 xDstInSrc= xInc/2 - 0x8000;
985 for (i=0; i<dstW; i++)
986 {
987 int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
988 int j;
989
990 (*filterPos)[i]= xx;
991 //Bilinear upscale / linear interpolate / Area averaging
992 for (j=0; j<filterSize; j++)
993 {
994 double d= FFABS((xx<<16) - xDstInSrc)/(double)(1<<16);
995 double coeff= 1.0 - d;
996 if (coeff<0) coeff=0;
997 filter[i*filterSize + j]= coeff;
998 xx++;
999 }
1000 xDstInSrc+= xInc;
1001 }
1002 }
1003 else
1004 {
1005 double xDstInSrc;
1006 double sizeFactor, filterSizeInSrc;
1007 const double xInc1= (double)xInc / (double)(1<<16);
1008
1009 if (flags&SWS_BICUBIC) sizeFactor= 4.0;
1010 else if (flags&SWS_X) sizeFactor= 8.0;
1011 else if (flags&SWS_AREA) sizeFactor= 1.0; //downscale only, for upscale it is bilinear
1012 else if (flags&SWS_GAUSS) sizeFactor= 8.0; // infinite ;)
1013 else if (flags&SWS_LANCZOS) sizeFactor= param[0] != SWS_PARAM_DEFAULT ? 2.0*param[0] : 6.0;
1014 else if (flags&SWS_SINC) sizeFactor= 20.0; // infinite ;)
1015 else if (flags&SWS_SPLINE) sizeFactor= 20.0; // infinite ;)
1016 else if (flags&SWS_BILINEAR) sizeFactor= 2.0;
1017 else {
1018 sizeFactor= 0.0; //GCC warning killer
1019 assert(0);
1020 }
1021
1022 if (xInc1 <= 1.0) filterSizeInSrc= sizeFactor; // upscale
1023 else filterSizeInSrc= sizeFactor*srcW / (double)dstW;
1024
1025 filterSize= (int)ceil(1 + filterSizeInSrc); // will be reduced later if possible
1026 if (filterSize > srcW-2) filterSize=srcW-2;
1027
1028 filter= av_malloc(dstW*sizeof(double)*filterSize);
1029
1030 xDstInSrc= xInc1 / 2.0 - 0.5;
1031 for (i=0; i<dstW; i++)
1032 {
1033 int xx= (int)(xDstInSrc - (filterSize-1)*0.5 + 0.5);
1034 int j;
1035 (*filterPos)[i]= xx;
1036 for (j=0; j<filterSize; j++)
1037 {
1038 double d= FFABS(xx - xDstInSrc)/filterSizeInSrc*sizeFactor;
1039 double coeff;
1040 if (flags & SWS_BICUBIC)
1041 {
1042 double B= param[0] != SWS_PARAM_DEFAULT ? param[0] : 0.0;
1043 double C= param[1] != SWS_PARAM_DEFAULT ? param[1] : 0.6;
1044
1045 if (d<1.0)
1046 coeff = (12-9*B-6*C)*d*d*d + (-18+12*B+6*C)*d*d + 6-2*B;
1047 else if (d<2.0)
1048 coeff = (-B-6*C)*d*d*d + (6*B+30*C)*d*d + (-12*B-48*C)*d +8*B+24*C;
1049 else
1050 coeff=0.0;
1051 }
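                    /* B and C select a member of the Mitchell-Netravali cubic
                       family: (B,C) = (1,0) gives the cubic B-spline, (0,0.5)
                       Catmull-Rom, and (1/3,1/3) the filter Mitchell and
                       Netravali recommend; the defaults here are B=0.0, C=0.6. */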
1052/* else if (flags & SWS_X)
1053 {
1054 double p= param ? param*0.01 : 0.3;
1055 coeff = d ? sin(d*PI)/(d*PI) : 1.0;
1056 coeff*= pow(2.0, - p*d*d);
1057 }*/
1058 else if (flags & SWS_X)
1059 {
1060 double A= param[0] != SWS_PARAM_DEFAULT ? param[0] : 1.0;
1061
1062 if (d<1.0)
1063 coeff = cos(d*PI);
1064 else
1065 coeff=-1.0;
1066 if (coeff<0.0) coeff= -pow(-coeff, A);
1067 else coeff= pow( coeff, A);
1068 coeff= coeff*0.5 + 0.5;
1069 }
1070 else if (flags & SWS_AREA)
1071 {
1072 double srcPixelSize= 1.0/xInc1;
1073 if (d + srcPixelSize/2 < 0.5) coeff= 1.0;
1074 else if (d - srcPixelSize/2 < 0.5) coeff= (0.5-d)/srcPixelSize + 0.5;
1075 else coeff=0.0;
1076 }
1077 else if (flags & SWS_GAUSS)
1078 {
1079 double p= param[0] != SWS_PARAM_DEFAULT ? param[0] : 3.0;
1080 coeff = pow(2.0, - p*d*d);
1081 }
1082 else if (flags & SWS_SINC)
1083 {
1084 coeff = d ? sin(d*PI)/(d*PI) : 1.0;
1085 }
1086 else if (flags & SWS_LANCZOS)
1087 {
1088 double p= param[0] != SWS_PARAM_DEFAULT ? param[0] : 3.0;
1089 coeff = d ? sin(d*PI)*sin(d*PI/p)/(d*d*PI*PI/p) : 1.0;
1090 if (d>p) coeff=0;
1091 }
1092 else if (flags & SWS_BILINEAR)
1093 {
1094 coeff= 1.0 - d;
1095 if (coeff<0) coeff=0;
1096 }
1097 else if (flags & SWS_SPLINE)
1098 {
1099 double p=-2.196152422706632;
1100 coeff = getSplineCoeff(1.0, 0.0, p, -p-1.0, d);
1101 }
1102 else {
1103 coeff= 0.0; //GCC warning killer
1104 assert(0);
1105 }
1106
1107 filter[i*filterSize + j]= coeff;
1108 xx++;
1109 }
1110 xDstInSrc+= xInc1;
1111 }
1112 }
1113
1114 /* apply src & dst Filter to filter -> filter2
1115 av_free(filter);
1116 */
1117 assert(filterSize>0);
1118 filter2Size= filterSize;
1119 if (srcFilter) filter2Size+= srcFilter->length - 1;
1120 if (dstFilter) filter2Size+= dstFilter->length - 1;
1121 assert(filter2Size>0);
1122 filter2= av_malloc(filter2Size*dstW*sizeof(double));
1123
1124 for (i=0; i<dstW; i++)
1125 {
1126 int j;
1127 SwsVector scaleFilter;
1128 SwsVector *outVec;
1129
1130 scaleFilter.coeff= filter + i*filterSize;
1131 scaleFilter.length= filterSize;
1132
1133 if (srcFilter) outVec= sws_getConvVec(srcFilter, &scaleFilter);
1134 else outVec= &scaleFilter;
1135
1136 assert(outVec->length == filter2Size);
1137 //FIXME dstFilter
1138
1139 for (j=0; j<outVec->length; j++)
1140 {
1141 filter2[i*filter2Size + j]= outVec->coeff[j];
1142 }
1143
1144 (*filterPos)[i]+= (filterSize-1)/2 - (filter2Size-1)/2;
1145
1146 if (outVec != &scaleFilter) sws_freeVec(outVec);
1147 }
1148 av_freep(&filter);
1149
1150    /* try to reduce the filter size (step 1: find size and shift left) */
1151 // Assume it is near normalized (*0.5 or *2.0 is OK but * 0.001 is not).
1152 minFilterSize= 0;
1153 for (i=dstW-1; i>=0; i--)
1154 {
1155 int min= filter2Size;
1156 int j;
1157 double cutOff=0.0;
1158
1159        /* get rid of near-zero elements on the left by shifting left */
1160 for (j=0; j<filter2Size; j++)
1161 {
1162 int k;
1163 cutOff += FFABS(filter2[i*filter2Size]);
1164
1165 if (cutOff > SWS_MAX_REDUCE_CUTOFF) break;
1166
1167 /* preserve monotonicity because the core can't handle the filter otherwise */
1168 if (i<dstW-1 && (*filterPos)[i] >= (*filterPos)[i+1]) break;
1169
1170 // Move filter coeffs left
1171 for (k=1; k<filter2Size; k++)
1172 filter2[i*filter2Size + k - 1]= filter2[i*filter2Size + k];
1173 filter2[i*filter2Size + k - 1]= 0.0;
1174 (*filterPos)[i]++;
1175 }
1176
1177 cutOff=0.0;
1178 /* count near zeros on the right */
1179 for (j=filter2Size-1; j>0; j--)
1180 {
1181 cutOff += FFABS(filter2[i*filter2Size + j]);
1182
1183 if (cutOff > SWS_MAX_REDUCE_CUTOFF) break;
1184 min--;
1185 }
1186
1187 if (min>minFilterSize) minFilterSize= min;
1188 }
1189
1190 if (flags & SWS_CPU_CAPS_ALTIVEC) {
1191 // we can handle the special case 4,
1192 // so we don't want to go to the full 8
1193 if (minFilterSize < 5)
1194 filterAlign = 4;
1195
1196 // we really don't want to waste our time
1197        // doing useless computation, so fall back on
1198        // the scalar C code for very small filters.
1199        // Vectorizing is worth it only if you have
1200        // a decent-sized vector.
1201 if (minFilterSize < 3)
1202 filterAlign = 1;
1203 }
1204
1205 if (flags & SWS_CPU_CAPS_MMX) {
1206 // special case for unscaled vertical filtering
1207 if (minFilterSize == 1 && filterAlign == 2)
1208 filterAlign= 1;
1209 }
1210
1211 assert(minFilterSize > 0);
1212 filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
1213 assert(filterSize > 0);
1214 filter= av_malloc(filterSize*dstW*sizeof(double));
1215 if (filterSize >= MAX_FILTER_SIZE || !filter)
1216 goto error;
1217 *outFilterSize= filterSize;
1218
1219 if (flags&SWS_PRINT_INFO)
1220 av_log(NULL, AV_LOG_VERBOSE, "SwScaler: reducing / aligning filtersize %d -> %d\n", filter2Size, filterSize);
1221    /* try to reduce the filter size (step 2: reduce it) */
1222 for (i=0; i<dstW; i++)
1223 {
1224 int j;
1225
1226 for (j=0; j<filterSize; j++)
1227 {
1228 if (j>=filter2Size) filter[i*filterSize + j]= 0.0;
1229 else filter[i*filterSize + j]= filter2[i*filter2Size + j];
1230 }
1231 }
1232
1233
1234 //FIXME try to align filterpos if possible
1235
1236 //fix borders
1237 for (i=0; i<dstW; i++)
1238 {
1239 int j;
1240 if ((*filterPos)[i] < 0)
1241 {
1242 // Move filter coeffs left to compensate for filterPos
1243 for (j=1; j<filterSize; j++)
1244 {
1245 int left= FFMAX(j + (*filterPos)[i], 0);
1246 filter[i*filterSize + left] += filter[i*filterSize + j];
1247 filter[i*filterSize + j]=0;
1248 }
1249 (*filterPos)[i]= 0;
1250 }
1251
1252 if ((*filterPos)[i] + filterSize > srcW)
1253 {
1254 int shift= (*filterPos)[i] + filterSize - srcW;
1255 // Move filter coeffs right to compensate for filterPos
1256 for (j=filterSize-2; j>=0; j--)
1257 {
1258 int right= FFMIN(j + shift, filterSize-1);
1259 filter[i*filterSize +right] += filter[i*filterSize +j];
1260 filter[i*filterSize +j]=0;
1261 }
1262 (*filterPos)[i]= srcW - filterSize;
1263 }
1264 }
1265
1266    // Note the +1 is for the MMX scaler, which reads over the end
1267 /* align at 16 for AltiVec (needed by hScale_altivec_real) */
1268 *outFilter= av_mallocz(*outFilterSize*(dstW+1)*sizeof(int16_t));
1269
1270 /* Normalize & Store in outFilter */
1271 for (i=0; i<dstW; i++)
1272 {
1273 int j;
1274 double error=0;
1275 double sum=0;
1276 double scale= one;
1277
1278 for (j=0; j<filterSize; j++)
1279 {
1280 sum+= filter[i*filterSize + j];
1281 }
1282 scale/= sum;
1283 for (j=0; j<*outFilterSize; j++)
1284 {
1285 double v= filter[i*filterSize + j]*scale + error;
1286 int intV= floor(v + 0.5);
1287 (*outFilter)[i*(*outFilterSize) + j]= intV;
1288 error = v - intV;
1289 }
1290 }
1291
1292 (*filterPos)[dstW]= (*filterPos)[dstW-1]; // the MMX scaler will read over the end
1293 for (i=0; i<*outFilterSize; i++)
1294 {
1295 int j= dstW*(*outFilterSize);
1296 (*outFilter)[j + i]= (*outFilter)[j + i - (*outFilterSize)];
1297 }
1298
1299 ret=0;
1300error:
1301 av_free(filter);
1302 av_free(filter2);
1303 return ret;
1304}
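/*
 * The "Normalize & Store" loop above converts the double-precision taps
 * to 16-bit fixed point while carrying each rounding error into the next
 * tap, so every output row still sums to exactly "one" (1<<14 for the
 * horizontal filters set up later). A minimal stand-alone sketch of that
 * error-diffusion idea (illustrative only, assumes <math.h> for floor()):
 */
#if 0
static void quantize_row(const double *taps, int16_t *out, int n, int one)
{
    double sum = 0.0, error = 0.0;
    int j;
    for (j = 0; j < n; j++) sum += taps[j];
    for (j = 0; j < n; j++) {
        double v = taps[j]*one/sum + error;   /* normalized tap + residue */
        out[j] = (int16_t)floor(v + 0.5);     /* round to nearest */
        error  = v - out[j];                  /* carry error to next tap */
    }
}
#endif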
1305
1306#ifdef COMPILE_MMX2
1307static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *filter, int32_t *filterPos, int numSplits)
1308{
1309 uint8_t *fragmentA;
1310 long imm8OfPShufW1A;
1311 long imm8OfPShufW2A;
1312 long fragmentLengthA;
1313 uint8_t *fragmentB;
1314 long imm8OfPShufW1B;
1315 long imm8OfPShufW2B;
1316 long fragmentLengthB;
1317 int fragmentPos;
1318
1319 int xpos, i;
1320
1321 // create an optimized horizontal scaling routine
1322
1323 //code fragment
1324
1325 asm volatile(
1326 "jmp 9f \n\t"
1327 // Begin
1328 "0: \n\t"
1329 "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t"
1330 "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t"
1331 "movd 1(%%"REG_c", %%"REG_S"), %%mm1 \n\t"
1332 "punpcklbw %%mm7, %%mm1 \n\t"
1333 "punpcklbw %%mm7, %%mm0 \n\t"
1334 "pshufw $0xFF, %%mm1, %%mm1 \n\t"
1335 "1: \n\t"
1336 "pshufw $0xFF, %%mm0, %%mm0 \n\t"
1337 "2: \n\t"
1338 "psubw %%mm1, %%mm0 \n\t"
1339 "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t"
1340 "pmullw %%mm3, %%mm0 \n\t"
1341 "psllw $7, %%mm1 \n\t"
1342 "paddw %%mm1, %%mm0 \n\t"
1343
1344 "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t"
1345
1346 "add $8, %%"REG_a" \n\t"
1347 // End
1348 "9: \n\t"
1349// "int $3 \n\t"
1350 "lea " LOCAL_MANGLE(0b) ", %0 \n\t"
1351 "lea " LOCAL_MANGLE(1b) ", %1 \n\t"
1352 "lea " LOCAL_MANGLE(2b) ", %2 \n\t"
1353 "dec %1 \n\t"
1354 "dec %2 \n\t"
1355 "sub %0, %1 \n\t"
1356 "sub %0, %2 \n\t"
1357 "lea " LOCAL_MANGLE(9b) ", %3 \n\t"
1358 "sub %0, %3 \n\t"
1359
1360
1361 :"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
1362 "=r" (fragmentLengthA)
1363 );
1364
1365 asm volatile(
1366 "jmp 9f \n\t"
1367 // Begin
1368 "0: \n\t"
1369 "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t"
1370 "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t"
1371 "punpcklbw %%mm7, %%mm0 \n\t"
1372 "pshufw $0xFF, %%mm0, %%mm1 \n\t"
1373 "1: \n\t"
1374 "pshufw $0xFF, %%mm0, %%mm0 \n\t"
1375 "2: \n\t"
1376 "psubw %%mm1, %%mm0 \n\t"
1377 "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t"
1378 "pmullw %%mm3, %%mm0 \n\t"
1379 "psllw $7, %%mm1 \n\t"
1380 "paddw %%mm1, %%mm0 \n\t"
1381
1382 "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t"
1383
1384 "add $8, %%"REG_a" \n\t"
1385 // End
1386 "9: \n\t"
1387// "int $3 \n\t"
1388 "lea " LOCAL_MANGLE(0b) ", %0 \n\t"
1389 "lea " LOCAL_MANGLE(1b) ", %1 \n\t"
1390 "lea " LOCAL_MANGLE(2b) ", %2 \n\t"
1391 "dec %1 \n\t"
1392 "dec %2 \n\t"
1393 "sub %0, %1 \n\t"
1394 "sub %0, %2 \n\t"
1395 "lea " LOCAL_MANGLE(9b) ", %3 \n\t"
1396 "sub %0, %3 \n\t"
1397
1398
1399 :"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
1400 "=r" (fragmentLengthB)
1401 );
1402
1403 xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
1404 fragmentPos=0;
1405
1406 for (i=0; i<dstW/numSplits; i++)
1407 {
1408 int xx=xpos>>16;
1409
1410 if ((i&3) == 0)
1411 {
1412 int a=0;
1413 int b=((xpos+xInc)>>16) - xx;
1414 int c=((xpos+xInc*2)>>16) - xx;
1415 int d=((xpos+xInc*3)>>16) - xx;
1416
1417 filter[i ] = (( xpos & 0xFFFF) ^ 0xFFFF)>>9;
1418 filter[i+1] = (((xpos+xInc ) & 0xFFFF) ^ 0xFFFF)>>9;
1419 filter[i+2] = (((xpos+xInc*2) & 0xFFFF) ^ 0xFFFF)>>9;
1420 filter[i+3] = (((xpos+xInc*3) & 0xFFFF) ^ 0xFFFF)>>9;
1421 filterPos[i/2]= xx;
1422
1423 if (d+1<4)
1424 {
1425 int maxShift= 3-(d+1);
1426 int shift=0;
1427
1428 memcpy(funnyCode + fragmentPos, fragmentB, fragmentLengthB);
1429
1430 funnyCode[fragmentPos + imm8OfPShufW1B]=
1431 (a+1) | ((b+1)<<2) | ((c+1)<<4) | ((d+1)<<6);
1432 funnyCode[fragmentPos + imm8OfPShufW2B]=
1433 a | (b<<2) | (c<<4) | (d<<6);
1434
1435 if (i+3>=dstW) shift=maxShift; //avoid overread
1436 else if ((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //Align
1437
1438 if (shift && i>=shift)
1439 {
1440 funnyCode[fragmentPos + imm8OfPShufW1B]+= 0x55*shift;
1441 funnyCode[fragmentPos + imm8OfPShufW2B]+= 0x55*shift;
1442 filterPos[i/2]-=shift;
1443 }
1444
1445 fragmentPos+= fragmentLengthB;
1446 }
1447 else
1448 {
1449 int maxShift= 3-d;
1450 int shift=0;
1451
1452 memcpy(funnyCode + fragmentPos, fragmentA, fragmentLengthA);
1453
1454 funnyCode[fragmentPos + imm8OfPShufW1A]=
1455 funnyCode[fragmentPos + imm8OfPShufW2A]=
1456 a | (b<<2) | (c<<4) | (d<<6);
1457
1458 if (i+4>=dstW) shift=maxShift; //avoid overread
1459 else if ((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //partial align
1460
1461 if (shift && i>=shift)
1462 {
1463 funnyCode[fragmentPos + imm8OfPShufW1A]+= 0x55*shift;
1464 funnyCode[fragmentPos + imm8OfPShufW2A]+= 0x55*shift;
1465 filterPos[i/2]-=shift;
1466 }
1467
1468 fragmentPos+= fragmentLengthA;
1469 }
1470
1471 funnyCode[fragmentPos]= RET;
1472 }
1473 xpos+=xInc;
1474 }
1475 filterPos[i/2]= xpos>>16; // needed to jump to the next part
1476}
1477#endif /* COMPILE_MMX2 */
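/*
 * initMMX2HScaler() above generates code at runtime: the two asm blocks
 * are only templates, and the trailing lea/sub sequences measure each
 * template's start address, the byte offsets of its two pshufw immediates,
 * and its total length. The per-pixel loop then memcpy()s a template into
 * funnyCode[] and patches the immediates in place, e.g. (names as in the
 * function, selector values invented):
 */
#if 0
    memcpy(funnyCode + fragmentPos, fragmentA, fragmentLengthA);
    /* pack four 2-bit source-pixel selectors into one pshufw imm8 */
    funnyCode[fragmentPos + imm8OfPShufW1A] = a | (b<<2) | (c<<4) | (d<<6);
    fragmentPos += fragmentLengthA;
    funnyCode[fragmentPos] = RET;   /* terminate the generated routine */
#endif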
1478
1479static void globalInit(void){
1480 // generating tables:
1481 int i;
1482 for (i=0; i<768; i++){
1483 int c= av_clip_uint8(i-256);
1484 clip_table[i]=c;
1485 }
1486}
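/*
 * clip_table[] maps an index in 0..767 to av_clip_uint8(i-256), so any
 * x in [-256, 511] can be clamped with a single lookup instead of two
 * compares (illustrative only):
 */
#if 0
    uint8_t clamped = clip_table[x + 256];   /* == av_clip_uint8(x) */
#endif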
1487
1488static SwsFunc getSwsFunc(int flags){
1489
1490#if defined(RUNTIME_CPUDETECT) && defined (CONFIG_GPL)
1491#if defined(ARCH_X86)
1492    // ordered by speed, fastest first
1493 if (flags & SWS_CPU_CAPS_MMX2)
1494 return swScale_MMX2;
1495 else if (flags & SWS_CPU_CAPS_3DNOW)
1496 return swScale_3DNow;
1497 else if (flags & SWS_CPU_CAPS_MMX)
1498 return swScale_MMX;
1499 else
1500 return swScale_C;
1501
1502#else
1503#ifdef ARCH_POWERPC
1504 if (flags & SWS_CPU_CAPS_ALTIVEC)
1505 return swScale_altivec;
1506 else
1507 return swScale_C;
1508#endif
1509 return swScale_C;
1510#endif /* defined(ARCH_X86) */
1511#else //RUNTIME_CPUDETECT
1512#ifdef HAVE_MMX2
1513 return swScale_MMX2;
1514#elif defined (HAVE_3DNOW)
1515 return swScale_3DNow;
1516#elif defined (HAVE_MMX)
1517 return swScale_MMX;
1518#elif defined (HAVE_ALTIVEC)
1519 return swScale_altivec;
1520#else
1521 return swScale_C;
1522#endif
1523#endif //!RUNTIME_CPUDETECT
1524}
1525
1526static int PlanarToNV12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1527 int srcSliceH, uint8_t* dstParam[], int dstStride[]){
1528 uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
1529 /* Copy Y plane */
1530 if (dstStride[0]==srcStride[0] && srcStride[0] > 0)
1531 memcpy(dst, src[0], srcSliceH*dstStride[0]);
1532 else
1533 {
1534 int i;
1535 uint8_t *srcPtr= src[0];
1536 uint8_t *dstPtr= dst;
1537 for (i=0; i<srcSliceH; i++)
1538 {
1539 memcpy(dstPtr, srcPtr, c->srcW);
1540 srcPtr+= srcStride[0];
1541 dstPtr+= dstStride[0];
1542 }
1543 }
1544 dst = dstParam[1] + dstStride[1]*srcSliceY/2;
1545 if (c->dstFormat == PIX_FMT_NV12)
1546 interleaveBytes(src[1], src[2], dst, c->srcW/2, srcSliceH/2, srcStride[1], srcStride[2], dstStride[0]);
1547 else
1548 interleaveBytes(src[2], src[1], dst, c->srcW/2, srcSliceH/2, srcStride[2], srcStride[1], dstStride[0]);
1549
1550 return srcSliceH;
1551}
1552
1553static int PlanarToYuy2Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1554 int srcSliceH, uint8_t* dstParam[], int dstStride[]){
1555 uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
1556
1557 yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0], srcStride[1], dstStride[0]);
1558
1559 return srcSliceH;
1560}
1561
1562static int PlanarToUyvyWrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1563 int srcSliceH, uint8_t* dstParam[], int dstStride[]){
1564 uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
1565
1566 yv12touyvy(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0], srcStride[1], dstStride[0]);
1567
1568 return srcSliceH;
1569}
1570
1571/* {RGB,BGR}{15,16,24,32} -> {RGB,BGR}{15,16,24,32} */
1572static int rgb2rgbWrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1573 int srcSliceH, uint8_t* dst[], int dstStride[]){
1574 const int srcFormat= c->srcFormat;
1575 const int dstFormat= c->dstFormat;
1576 const int srcBpp= (fmt_depth(srcFormat) + 7) >> 3;
1577 const int dstBpp= (fmt_depth(dstFormat) + 7) >> 3;
1578 const int srcId= fmt_depth(srcFormat) >> 2; /* 1:0, 4:1, 8:2, 15:3, 16:4, 24:6, 32:8 */
1579 const int dstId= fmt_depth(dstFormat) >> 2;
1580 void (*conv)(const uint8_t *src, uint8_t *dst, long src_size)=NULL;
1581
1582 /* BGR -> BGR */
1583 if ( (isBGR(srcFormat) && isBGR(dstFormat))
1584 || (isRGB(srcFormat) && isRGB(dstFormat))){
1585 switch(srcId | (dstId<<4)){
1586 case 0x34: conv= rgb16to15; break;
1587 case 0x36: conv= rgb24to15; break;
1588 case 0x38: conv= rgb32to15; break;
1589 case 0x43: conv= rgb15to16; break;
1590 case 0x46: conv= rgb24to16; break;
1591 case 0x48: conv= rgb32to16; break;
1592 case 0x63: conv= rgb15to24; break;
1593 case 0x64: conv= rgb16to24; break;
1594 case 0x68: conv= rgb32to24; break;
1595 case 0x83: conv= rgb15to32; break;
1596 case 0x84: conv= rgb16to32; break;
1597 case 0x86: conv= rgb24to32; break;
1598 default: av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
1599 sws_format_name(srcFormat), sws_format_name(dstFormat)); break;
1600 }
1601 }else if ( (isBGR(srcFormat) && isRGB(dstFormat))
1602 || (isRGB(srcFormat) && isBGR(dstFormat))){
1603 switch(srcId | (dstId<<4)){
1604 case 0x33: conv= rgb15tobgr15; break;
1605 case 0x34: conv= rgb16tobgr15; break;
1606 case 0x36: conv= rgb24tobgr15; break;
1607 case 0x38: conv= rgb32tobgr15; break;
1608 case 0x43: conv= rgb15tobgr16; break;
1609 case 0x44: conv= rgb16tobgr16; break;
1610 case 0x46: conv= rgb24tobgr16; break;
1611 case 0x48: conv= rgb32tobgr16; break;
1612 case 0x63: conv= rgb15tobgr24; break;
1613 case 0x64: conv= rgb16tobgr24; break;
1614 case 0x66: conv= rgb24tobgr24; break;
1615 case 0x68: conv= rgb32tobgr24; break;
1616 case 0x83: conv= rgb15tobgr32; break;
1617 case 0x84: conv= rgb16tobgr32; break;
1618 case 0x86: conv= rgb24tobgr32; break;
1619 case 0x88: conv= rgb32tobgr32; break;
1620 default: av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
1621 sws_format_name(srcFormat), sws_format_name(dstFormat)); break;
1622 }
1623 }else{
1624 av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
1625 sws_format_name(srcFormat), sws_format_name(dstFormat));
1626 }
1627
1628 if(conv)
1629 {
1630 if (dstStride[0]*srcBpp == srcStride[0]*dstBpp && srcStride[0] > 0)
1631 conv(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1632 else
1633 {
1634 int i;
1635 uint8_t *srcPtr= src[0];
1636 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1637
1638 for (i=0; i<srcSliceH; i++)
1639 {
1640 conv(srcPtr, dstPtr, c->srcW*srcBpp);
1641 srcPtr+= srcStride[0];
1642 dstPtr+= dstStride[0];
1643 }
1644 }
1645 }
1646 return srcSliceH;
1647}
1648
1649static int bgr24toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1650 int srcSliceH, uint8_t* dst[], int dstStride[]){
1651
1652 rgb24toyv12(
1653 src[0],
1654 dst[0]+ srcSliceY *dstStride[0],
1655 dst[1]+(srcSliceY>>1)*dstStride[1],
1656 dst[2]+(srcSliceY>>1)*dstStride[2],
1657 c->srcW, srcSliceH,
1658 dstStride[0], dstStride[1], srcStride[0]);
1659 return srcSliceH;
1660}
1661
1662static int yvu9toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1663 int srcSliceH, uint8_t* dst[], int dstStride[]){
1664 int i;
1665
1666 /* copy Y */
1667 if (srcStride[0]==dstStride[0] && srcStride[0] > 0)
1668 memcpy(dst[0]+ srcSliceY*dstStride[0], src[0], srcStride[0]*srcSliceH);
1669 else{
1670 uint8_t *srcPtr= src[0];
1671 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1672
1673 for (i=0; i<srcSliceH; i++)
1674 {
1675 memcpy(dstPtr, srcPtr, c->srcW);
1676 srcPtr+= srcStride[0];
1677 dstPtr+= dstStride[0];
1678 }
1679 }
1680
1681 if (c->dstFormat==PIX_FMT_YUV420P){
1682 planar2x(src[1], dst[1], c->chrSrcW, c->chrSrcH, srcStride[1], dstStride[1]);
1683 planar2x(src[2], dst[2], c->chrSrcW, c->chrSrcH, srcStride[2], dstStride[2]);
1684 }else{
1685 planar2x(src[1], dst[2], c->chrSrcW, c->chrSrcH, srcStride[1], dstStride[2]);
1686 planar2x(src[2], dst[1], c->chrSrcW, c->chrSrcH, srcStride[2], dstStride[1]);
1687 }
1688 return srcSliceH;
1689}
1690
1691/* unscaled copy-like functions (assume nearly identical formats) */
1692static int packedCopy(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1693 int srcSliceH, uint8_t* dst[], int dstStride[])
1694{
1695 if (dstStride[0]==srcStride[0] && srcStride[0] > 0)
1696 memcpy(dst[0] + dstStride[0]*srcSliceY, src[0], srcSliceH*dstStride[0]);
1697 else
1698 {
1699 int i;
1700 uint8_t *srcPtr= src[0];
1701 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1702 int length=0;
1703
1704 /* universal length finder */
1705 while(length+c->srcW <= FFABS(dstStride[0])
1706 && length+c->srcW <= FFABS(srcStride[0])) length+= c->srcW;
1707 assert(length!=0);
1708
1709 for (i=0; i<srcSliceH; i++)
1710 {
1711 memcpy(dstPtr, srcPtr, length);
1712 srcPtr+= srcStride[0];
1713 dstPtr+= dstStride[0];
1714 }
1715 }
1716 return srcSliceH;
1717}
1718
1719static int planarCopy(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1720 int srcSliceH, uint8_t* dst[], int dstStride[])
1721{
1722 int plane;
1723 for (plane=0; plane<3; plane++)
1724 {
1725 int length= plane==0 ? c->srcW : -((-c->srcW )>>c->chrDstHSubSample);
1726 int y= plane==0 ? srcSliceY: -((-srcSliceY)>>c->chrDstVSubSample);
1727 int height= plane==0 ? srcSliceH: -((-srcSliceH)>>c->chrDstVSubSample);
1728
1729 if ((isGray(c->srcFormat) || isGray(c->dstFormat)) && plane>0)
1730 {
1731 if (!isGray(c->dstFormat))
1732 memset(dst[plane], 128, dstStride[plane]*height);
1733 }
1734 else
1735 {
1736 if (dstStride[plane]==srcStride[plane] && srcStride[plane] > 0)
1737 memcpy(dst[plane] + dstStride[plane]*y, src[plane], height*dstStride[plane]);
1738 else
1739 {
1740 int i;
1741 uint8_t *srcPtr= src[plane];
1742 uint8_t *dstPtr= dst[plane] + dstStride[plane]*y;
1743 for (i=0; i<height; i++)
1744 {
1745 memcpy(dstPtr, srcPtr, length);
1746 srcPtr+= srcStride[plane];
1747 dstPtr+= dstStride[plane];
1748 }
1749 }
1750 }
1751 }
1752 return srcSliceH;
1753}
1754
1755static int gray16togray(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1756 int srcSliceH, uint8_t* dst[], int dstStride[]){
1757
1758 int length= c->srcW;
1759 int y= srcSliceY;
1760 int height= srcSliceH;
1761 int i, j;
1762 uint8_t *srcPtr= src[0];
1763 uint8_t *dstPtr= dst[0] + dstStride[0]*y;
1764
1765 if (!isGray(c->dstFormat)){
1766 int height= -((-srcSliceH)>>c->chrDstVSubSample);
1767 memset(dst[1], 128, dstStride[1]*height);
1768 memset(dst[2], 128, dstStride[2]*height);
1769 }
1770 if (c->srcFormat == PIX_FMT_GRAY16LE) srcPtr++;
1771 for (i=0; i<height; i++)
1772 {
1773 for (j=0; j<length; j++) dstPtr[j] = srcPtr[j<<1];
1774 srcPtr+= srcStride[0];
1775 dstPtr+= dstStride[0];
1776 }
1777 return srcSliceH;
1778}
1779
1780static int graytogray16(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1781 int srcSliceH, uint8_t* dst[], int dstStride[]){
1782
1783 int length= c->srcW;
1784 int y= srcSliceY;
1785 int height= srcSliceH;
1786 int i, j;
1787 uint8_t *srcPtr= src[0];
1788 uint8_t *dstPtr= dst[0] + dstStride[0]*y;
1789 for (i=0; i<height; i++)
1790 {
1791 for (j=0; j<length; j++)
1792 {
1793 dstPtr[j<<1] = srcPtr[j];
1794 dstPtr[(j<<1)+1] = srcPtr[j];
1795 }
1796 srcPtr+= srcStride[0];
1797 dstPtr+= dstStride[0];
1798 }
1799 return srcSliceH;
1800}
1801
1802static int gray16swap(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1803 int srcSliceH, uint8_t* dst[], int dstStride[]){
1804
1805 int length= c->srcW;
1806 int y= srcSliceY;
1807 int height= srcSliceH;
1808 int i, j;
1809 uint16_t *srcPtr= (uint16_t*)src[0];
1810 uint16_t *dstPtr= (uint16_t*)(dst[0] + dstStride[0]*y/2);
1811 for (i=0; i<height; i++)
1812 {
1813 for (j=0; j<length; j++) dstPtr[j] = bswap_16(srcPtr[j]);
1814 srcPtr+= srcStride[0]/2;
1815 dstPtr+= dstStride[0]/2;
1816 }
1817 return srcSliceH;
1818}
1819
1820
1821static void getSubSampleFactors(int *h, int *v, int format){
1822 switch(format){
1823 case PIX_FMT_UYVY422:
1824 case PIX_FMT_YUYV422:
1825 *h=1;
1826 *v=0;
1827 break;
1828 case PIX_FMT_YUV420P:
1829 case PIX_FMT_YUVA420P:
1830 case PIX_FMT_GRAY16BE:
1831 case PIX_FMT_GRAY16LE:
1832 case PIX_FMT_GRAY8: //FIXME remove after different subsamplings are fully implemented
1833 case PIX_FMT_NV12:
1834 case PIX_FMT_NV21:
1835 *h=1;
1836 *v=1;
1837 break;
1838 case PIX_FMT_YUV440P:
1839 *h=0;
1840 *v=1;
1841 break;
1842 case PIX_FMT_YUV410P:
1843 *h=2;
1844 *v=2;
1845 break;
1846 case PIX_FMT_YUV444P:
1847 *h=0;
1848 *v=0;
1849 break;
1850 case PIX_FMT_YUV422P:
1851 *h=1;
1852 *v=0;
1853 break;
1854 case PIX_FMT_YUV411P:
1855 *h=2;
1856 *v=0;
1857 break;
1858 default:
1859 *h=0;
1860 *v=0;
1861 break;
1862 }
1863}
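/* Worked example: PIX_FMT_YUV420P yields h = v = 1, so a 101x57 luma
 * plane pairs with a ceil(101/2) x ceil(57/2) = 51x29 chroma plane once
 * sws_getContext() applies its -((-x)>>shift) rounding below. */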
1864
1865static uint16_t roundToInt16(int64_t f){
1866 int r= (f + (1<<15))>>16;
1867 if (r<-0x7FFF) return 0x8000;
1868 else if (r> 0x7FFF) return 0x7FFF;
1869 else return r;
1870}
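/* Worked example: roundToInt16() treats f as 16.16 fixed point, e.g.
 * roundToInt16(3<<15) = (98304 + 32768)>>16 = 2 (1.5 rounds up to 2),
 * and any result outside [-0x7FFF, 0x7FFF] saturates. */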
1871
1872/**
1873 * @param inv_table the yuv2rgb coeffs, normally Inverse_Table_6_9[x]
1874 * @param fullRange if 1 the luma range is 0..255, if 0 it is 16..235
1875 * @return -1 if not supported
1876 */
1877int sws_setColorspaceDetails(SwsContext *c, const int inv_table[4], int srcRange, const int table[4], int dstRange, int brightness, int contrast, int saturation){
1878 int64_t crv = inv_table[0];
1879 int64_t cbu = inv_table[1];
1880 int64_t cgu = -inv_table[2];
1881 int64_t cgv = -inv_table[3];
1882 int64_t cy = 1<<16;
1883 int64_t oy = 0;
1884
1885 if (isYUV(c->dstFormat) || isGray(c->dstFormat)) return -1;
1886 memcpy(c->srcColorspaceTable, inv_table, sizeof(int)*4);
1887 memcpy(c->dstColorspaceTable, table, sizeof(int)*4);
1888
1889 c->brightness= brightness;
1890 c->contrast = contrast;
1891 c->saturation= saturation;
1892 c->srcRange = srcRange;
1893 c->dstRange = dstRange;
1894
1895 c->uOffset= 0x0400040004000400LL;
1896 c->vOffset= 0x0400040004000400LL;
1897
1898 if (!srcRange){
1899 cy= (cy*255) / 219;
1900 oy= 16<<16;
1901 }else{
1902 crv= (crv*224) / 255;
1903 cbu= (cbu*224) / 255;
1904 cgu= (cgu*224) / 255;
1905 cgv= (cgv*224) / 255;
1906 }
1907
1908 cy = (cy *contrast )>>16;
1909 crv= (crv*contrast * saturation)>>32;
1910 cbu= (cbu*contrast * saturation)>>32;
1911 cgu= (cgu*contrast * saturation)>>32;
1912 cgv= (cgv*contrast * saturation)>>32;
1913
1914 oy -= 256*brightness;
1915
1916 c->yCoeff= roundToInt16(cy *8192) * 0x0001000100010001ULL;
1917 c->vrCoeff= roundToInt16(crv*8192) * 0x0001000100010001ULL;
1918 c->ubCoeff= roundToInt16(cbu*8192) * 0x0001000100010001ULL;
1919 c->vgCoeff= roundToInt16(cgv*8192) * 0x0001000100010001ULL;
1920 c->ugCoeff= roundToInt16(cgu*8192) * 0x0001000100010001ULL;
1921 c->yOffset= roundToInt16(oy * 8) * 0x0001000100010001ULL;
1922
1923 yuv2rgb_c_init_tables(c, inv_table, srcRange, brightness, contrast, saturation);
1924 //FIXME factorize
1925
1926#ifdef COMPILE_ALTIVEC
1927 if (c->flags & SWS_CPU_CAPS_ALTIVEC)
1928 yuv2rgb_altivec_init_tables (c, inv_table, brightness, contrast, saturation);
1929#endif
1930 return 0;
1931}
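/*
 * A typical call, mirroring the defaults that sws_getContext() installs
 * below: both ranges limited (srcRange = dstRange = 0), neutral
 * brightness (0), and unity contrast/saturation in 16.16 fixed point:
 */
#if 0
    sws_setColorspaceDetails(c, Inverse_Table_6_9[SWS_CS_DEFAULT], 0,
                             Inverse_Table_6_9[SWS_CS_DEFAULT], 0,
                             0, 1<<16, 1<<16);
#endif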
1932
1933/**
1934 * @return -1 if not supported
1935 */
1936int sws_getColorspaceDetails(SwsContext *c, int **inv_table, int *srcRange, int **table, int *dstRange, int *brightness, int *contrast, int *saturation){
1937 if (isYUV(c->dstFormat) || isGray(c->dstFormat)) return -1;
1938
1939 *inv_table = c->srcColorspaceTable;
1940 *table = c->dstColorspaceTable;
1941 *srcRange = c->srcRange;
1942 *dstRange = c->dstRange;
1943 *brightness= c->brightness;
1944 *contrast = c->contrast;
1945 *saturation= c->saturation;
1946
1947 return 0;
1948}
1949
1950static int handle_jpeg(int *format)
1951{
1952 switch (*format) {
1953 case PIX_FMT_YUVJ420P:
1954 *format = PIX_FMT_YUV420P;
1955 return 1;
1956 case PIX_FMT_YUVJ422P:
1957 *format = PIX_FMT_YUV422P;
1958 return 1;
1959 case PIX_FMT_YUVJ444P:
1960 *format = PIX_FMT_YUV444P;
1961 return 1;
1962 case PIX_FMT_YUVJ440P:
1963 *format = PIX_FMT_YUV440P;
1964 return 1;
1965 default:
1966 return 0;
1967 }
1968}
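/*
 * handle_jpeg() both rewrites the format in place and reports the range;
 * a hypothetical caller (illustrative only):
 */
#if 0
    int fmt = PIX_FMT_YUVJ420P;
    int fullRange = handle_jpeg(&fmt);  /* fmt is now PIX_FMT_YUV420P,
                                           fullRange == 1 (0..255 luma) */
#endif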
1969
1970SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
1971 SwsFilter *srcFilter, SwsFilter *dstFilter, const double *param){
1972
1973 SwsContext *c;
1974 int i;
1975 int usesVFilter, usesHFilter;
1976 int unscaled, needsDither;
1977 int srcRange, dstRange;
1978 SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
1979#if defined(ARCH_X86)
1980 if (flags & SWS_CPU_CAPS_MMX)
1981 asm volatile("emms\n\t"::: "memory");
1982#endif
1983
1984#if !defined(RUNTIME_CPUDETECT) || !defined (CONFIG_GPL) //ensure that the flags match the compiled variant if cpudetect is off
1985 flags &= ~(SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2|SWS_CPU_CAPS_3DNOW|SWS_CPU_CAPS_ALTIVEC|SWS_CPU_CAPS_BFIN);
1986#ifdef HAVE_MMX2
1987 flags |= SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2;
1988#elif defined (HAVE_3DNOW)
1989 flags |= SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_3DNOW;
1990#elif defined (HAVE_MMX)
1991 flags |= SWS_CPU_CAPS_MMX;
1992#elif defined (HAVE_ALTIVEC)
1993 flags |= SWS_CPU_CAPS_ALTIVEC;
1994#elif defined (ARCH_BFIN)
1995 flags |= SWS_CPU_CAPS_BFIN;
1996#endif
1997#endif /* RUNTIME_CPUDETECT */
1998 if (clip_table[512] != 255) globalInit();
1999 if (!rgb15to16) sws_rgb2rgb_init(flags);
2000
2001 unscaled = (srcW == dstW && srcH == dstH);
2002 needsDither= (isBGR(dstFormat) || isRGB(dstFormat))
2003 && (fmt_depth(dstFormat))<24
2004 && ((fmt_depth(dstFormat))<(fmt_depth(srcFormat)) || (!(isRGB(srcFormat) || isBGR(srcFormat))));
2005
2006 srcRange = handle_jpeg(&srcFormat);
2007 dstRange = handle_jpeg(&dstFormat);
2008
2009 if (!isSupportedIn(srcFormat))
2010 {
2011 av_log(NULL, AV_LOG_ERROR, "swScaler: %s is not supported as input pixel format\n", sws_format_name(srcFormat));
2012 return NULL;
2013 }
2014 if (!isSupportedOut(dstFormat))
2015 {
2016 av_log(NULL, AV_LOG_ERROR, "swScaler: %s is not supported as output pixel format\n", sws_format_name(dstFormat));
2017 return NULL;
2018 }
2019
2020 i= flags & ( SWS_POINT
2021 |SWS_AREA
2022 |SWS_BILINEAR
2023 |SWS_FAST_BILINEAR
2024 |SWS_BICUBIC
2025 |SWS_X
2026 |SWS_GAUSS
2027 |SWS_LANCZOS
2028 |SWS_SINC
2029 |SWS_SPLINE
2030 |SWS_BICUBLIN);
2031 if(!i || (i & (i-1)))
2032 {
2033        av_log(NULL, AV_LOG_ERROR, "swScaler: Exactly one scaler algorithm must be chosen\n");
2034 return NULL;
2035 }
2036
2037
2038 /* sanity check */
2039    if (srcW<4 || srcH<1 || dstW<8 || dstH<1) //FIXME check if these are enough and try to lower them after fixing the relevant parts of the code
2040 {
2041 av_log(NULL, AV_LOG_ERROR, "swScaler: %dx%d -> %dx%d is invalid scaling dimension\n",
2042 srcW, srcH, dstW, dstH);
2043 return NULL;
2044 }
2045 if(srcW > VOFW || dstW > VOFW){
2046        av_log(NULL, AV_LOG_ERROR, "swScaler: Compile-time max width is "AV_STRINGIFY(VOFW)"; change VOF/VOFW and recompile\n");
2047 return NULL;
2048 }
2049
2050 if (!dstFilter) dstFilter= &dummyFilter;
2051 if (!srcFilter) srcFilter= &dummyFilter;
2052
2053 c= av_mallocz(sizeof(SwsContext));
2054
2055 c->av_class = &sws_context_class;
2056 c->srcW= srcW;
2057 c->srcH= srcH;
2058 c->dstW= dstW;
2059 c->dstH= dstH;
2060 c->lumXInc= ((srcW<<16) + (dstW>>1))/dstW;
2061 c->lumYInc= ((srcH<<16) + (dstH>>1))/dstH;
2062 c->flags= flags;
2063 c->dstFormat= dstFormat;
2064 c->srcFormat= srcFormat;
2065 c->vRounder= 4* 0x0001000100010001ULL;
2066
2067 usesHFilter= usesVFilter= 0;
2068 if (dstFilter->lumV && dstFilter->lumV->length>1) usesVFilter=1;
2069 if (dstFilter->lumH && dstFilter->lumH->length>1) usesHFilter=1;
2070 if (dstFilter->chrV && dstFilter->chrV->length>1) usesVFilter=1;
2071 if (dstFilter->chrH && dstFilter->chrH->length>1) usesHFilter=1;
2072 if (srcFilter->lumV && srcFilter->lumV->length>1) usesVFilter=1;
2073 if (srcFilter->lumH && srcFilter->lumH->length>1) usesHFilter=1;
2074 if (srcFilter->chrV && srcFilter->chrV->length>1) usesVFilter=1;
2075 if (srcFilter->chrH && srcFilter->chrH->length>1) usesHFilter=1;
2076
2077 getSubSampleFactors(&c->chrSrcHSubSample, &c->chrSrcVSubSample, srcFormat);
2078 getSubSampleFactors(&c->chrDstHSubSample, &c->chrDstVSubSample, dstFormat);
2079
2080    // reuse chroma for 2 pixels RGB/BGR unless user wants full chroma interpolation
2081 if ((isBGR(dstFormat) || isRGB(dstFormat)) && !(flags&SWS_FULL_CHR_H_INT)) c->chrDstHSubSample=1;
2082
2083 // drop some chroma lines if the user wants it
2084 c->vChrDrop= (flags&SWS_SRC_V_CHR_DROP_MASK)>>SWS_SRC_V_CHR_DROP_SHIFT;
2085 c->chrSrcVSubSample+= c->vChrDrop;
2086
2087    // drop every second pixel for chroma calculation unless user wants full chroma
2088 if ((isBGR(srcFormat) || isRGB(srcFormat)) && !(flags&SWS_FULL_CHR_H_INP)
2089 && srcFormat!=PIX_FMT_RGB8 && srcFormat!=PIX_FMT_BGR8
2090 && srcFormat!=PIX_FMT_RGB4 && srcFormat!=PIX_FMT_BGR4
2091 && srcFormat!=PIX_FMT_RGB4_BYTE && srcFormat!=PIX_FMT_BGR4_BYTE)
2092 c->chrSrcHSubSample=1;
2093
2094 if (param){
2095 c->param[0] = param[0];
2096 c->param[1] = param[1];
2097 }else{
2098 c->param[0] =
2099 c->param[1] = SWS_PARAM_DEFAULT;
2100 }
2101
2102 c->chrIntHSubSample= c->chrDstHSubSample;
2103 c->chrIntVSubSample= c->chrSrcVSubSample;
2104
2105 // Note the -((-x)>>y) is so that we always round toward +inf.
2106 c->chrSrcW= -((-srcW) >> c->chrSrcHSubSample);
2107 c->chrSrcH= -((-srcH) >> c->chrSrcVSubSample);
2108 c->chrDstW= -((-dstW) >> c->chrDstHSubSample);
2109 c->chrDstH= -((-dstH) >> c->chrDstVSubSample);
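    /* e.g. srcW = 101 with chrSrcHSubSample = 1: -((-101)>>1) = 51,
     * i.e. ceil(101/2), whereas 101>>1 = 50 would round toward -inf
     * and lose the last chroma sample. */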
2110
2111 sws_setColorspaceDetails(c, Inverse_Table_6_9[SWS_CS_DEFAULT], srcRange, Inverse_Table_6_9[SWS_CS_DEFAULT] /* FIXME*/, dstRange, 0, 1<<16, 1<<16);
2112
2113    /* unscaled special cases */
2114 if (unscaled && !usesHFilter && !usesVFilter)
2115 {
2116 /* yv12_to_nv12 */
2117 if (srcFormat == PIX_FMT_YUV420P && (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21))
2118 {
2119 c->swScale= PlanarToNV12Wrapper;
2120 }
2121#ifdef CONFIG_GPL
2122 /* yuv2bgr */
2123 if ((srcFormat==PIX_FMT_YUV420P || srcFormat==PIX_FMT_YUV422P) && (isBGR(dstFormat) || isRGB(dstFormat)))
2124 {
2125 c->swScale= yuv2rgb_get_func_ptr(c);
2126 }
2127#endif
2128
2129 if (srcFormat==PIX_FMT_YUV410P && dstFormat==PIX_FMT_YUV420P)
2130 {
2131 c->swScale= yvu9toyv12Wrapper;
2132 }
2133
2134 /* bgr24toYV12 */
2135 if (srcFormat==PIX_FMT_BGR24 && dstFormat==PIX_FMT_YUV420P)
2136 c->swScale= bgr24toyv12Wrapper;
2137
2138 /* rgb/bgr -> rgb/bgr (no dither needed forms) */
2139 if ( (isBGR(srcFormat) || isRGB(srcFormat))
2140 && (isBGR(dstFormat) || isRGB(dstFormat))
2141 && srcFormat != PIX_FMT_BGR8 && dstFormat != PIX_FMT_BGR8
2142 && srcFormat != PIX_FMT_RGB8 && dstFormat != PIX_FMT_RGB8
2143 && srcFormat != PIX_FMT_BGR4 && dstFormat != PIX_FMT_BGR4
2144 && srcFormat != PIX_FMT_RGB4 && dstFormat != PIX_FMT_RGB4
2145 && srcFormat != PIX_FMT_BGR4_BYTE && dstFormat != PIX_FMT_BGR4_BYTE
2146 && srcFormat != PIX_FMT_RGB4_BYTE && dstFormat != PIX_FMT_RGB4_BYTE
2147 && srcFormat != PIX_FMT_MONOBLACK && dstFormat != PIX_FMT_MONOBLACK
2148 && !needsDither)
2149 c->swScale= rgb2rgbWrapper;
2150
2151 /* LQ converters if -sws 0 or -sws 4*/
2152 if (c->flags&(SWS_FAST_BILINEAR|SWS_POINT)){
2153 /* rgb/bgr -> rgb/bgr (dither needed forms) */
2154 if ( (isBGR(srcFormat) || isRGB(srcFormat))
2155 && (isBGR(dstFormat) || isRGB(dstFormat))
2156 && needsDither)
2157 c->swScale= rgb2rgbWrapper;
2158
2159 /* yv12_to_yuy2 */
2160 if (srcFormat == PIX_FMT_YUV420P &&
2161 (dstFormat == PIX_FMT_YUYV422 || dstFormat == PIX_FMT_UYVY422))
2162 {
2163 if (dstFormat == PIX_FMT_YUYV422)
2164 c->swScale= PlanarToYuy2Wrapper;
2165 else
2166 c->swScale= PlanarToUyvyWrapper;
2167 }
2168 }
2169
2170#ifdef COMPILE_ALTIVEC
2171 if ((c->flags & SWS_CPU_CAPS_ALTIVEC) &&
2172 ((srcFormat == PIX_FMT_YUV420P &&
2173 (dstFormat == PIX_FMT_YUYV422 || dstFormat == PIX_FMT_UYVY422)))) {
2174 // unscaled YV12 -> packed YUV, we want speed
2175 if (dstFormat == PIX_FMT_YUYV422)
2176 c->swScale= yv12toyuy2_unscaled_altivec;
2177 else
2178 c->swScale= yv12touyvy_unscaled_altivec;
2179 }
2180#endif
2181
2182 /* simple copy */
2183 if ( srcFormat == dstFormat
2184 || (isPlanarYUV(srcFormat) && isGray(dstFormat))
2185 || (isPlanarYUV(dstFormat) && isGray(srcFormat)))
2186 {
2187 if (isPacked(c->srcFormat))
2188 c->swScale= packedCopy;
2189 else /* Planar YUV or gray */
2190 c->swScale= planarCopy;
2191 }
2192
2193 /* gray16{le,be} conversions */
2194 if (isGray16(srcFormat) && (isPlanarYUV(dstFormat) || (dstFormat == PIX_FMT_GRAY8)))
2195 {
2196 c->swScale= gray16togray;
2197 }
2198 if ((isPlanarYUV(srcFormat) || (srcFormat == PIX_FMT_GRAY8)) && isGray16(dstFormat))
2199 {
2200 c->swScale= graytogray16;
2201 }
2202 if (srcFormat != dstFormat && isGray16(srcFormat) && isGray16(dstFormat))
2203 {
2204 c->swScale= gray16swap;
2205 }
2206
2207#ifdef ARCH_BFIN
2208 if (flags & SWS_CPU_CAPS_BFIN)
2209 ff_bfin_get_unscaled_swscale (c);
2210#endif
2211
2212 if (c->swScale){
2213 if (flags&SWS_PRINT_INFO)
2214 av_log(c, AV_LOG_INFO, "using unscaled %s -> %s special converter\n",
2215 sws_format_name(srcFormat), sws_format_name(dstFormat));
2216 return c;
2217 }
2218 }
2219
2220 if (flags & SWS_CPU_CAPS_MMX2)
2221 {
2222 c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
2223 if (!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR))
2224 {
2225 if (flags&SWS_PRINT_INFO)
2226                av_log(c, AV_LOG_INFO, "output width is not a multiple of 32 -> no MMX2 scaler\n");
2227 }
2228 if (usesHFilter) c->canMMX2BeUsed=0;
2229 }
2230 else
2231 c->canMMX2BeUsed=0;
2232
2233 c->chrXInc= ((c->chrSrcW<<16) + (c->chrDstW>>1))/c->chrDstW;
2234 c->chrYInc= ((c->chrSrcH<<16) + (c->chrDstH>>1))/c->chrDstH;
2235
2236 // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
2237 // but only for the FAST_BILINEAR mode otherwise do correct scaling
2238 // n-2 is the last chrominance sample available
2239 // this is not perfect, but no one should notice the difference, the more correct variant
2240 // would be like the vertical one, but that would require some special code for the
2241 // first and last pixel
2242 if (flags&SWS_FAST_BILINEAR)
2243 {
2244 if (c->canMMX2BeUsed)
2245 {
2246 c->lumXInc+= 20;
2247 c->chrXInc+= 20;
2248 }
2249 //we don't use the x86asm scaler if mmx is available
2250 else if (flags & SWS_CPU_CAPS_MMX)
2251 {
2252 c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
2253 c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
2254 }
2255 }
2256
2257 /* precalculate horizontal scaler filter coefficients */
2258 {
2259 const int filterAlign=
2260 (flags & SWS_CPU_CAPS_MMX) ? 4 :
2261 (flags & SWS_CPU_CAPS_ALTIVEC) ? 8 :
2262 1;
2263
2264 initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
2265 srcW , dstW, filterAlign, 1<<14,
2266 (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC) : flags,
2267 srcFilter->lumH, dstFilter->lumH, c->param);
2268 initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
2269 c->chrSrcW, c->chrDstW, filterAlign, 1<<14,
2270 (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags,
2271 srcFilter->chrH, dstFilter->chrH, c->param);
2272
2273#define MAX_FUNNY_CODE_SIZE 10000
2274#if defined(COMPILE_MMX2)
2275// can't downscale !!!
2276 if (c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
2277 {
2278#ifdef MAP_ANONYMOUS
2279 c->funnyYCode = (uint8_t*)mmap(NULL, MAX_FUNNY_CODE_SIZE, PROT_EXEC | PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
2280 c->funnyUVCode = (uint8_t*)mmap(NULL, MAX_FUNNY_CODE_SIZE, PROT_EXEC | PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
2281#else
2282 c->funnyYCode = av_malloc(MAX_FUNNY_CODE_SIZE);
2283 c->funnyUVCode = av_malloc(MAX_FUNNY_CODE_SIZE);
2284#endif
2285
2286 c->lumMmx2Filter = av_malloc((dstW /8+8)*sizeof(int16_t));
2287 c->chrMmx2Filter = av_malloc((c->chrDstW /4+8)*sizeof(int16_t));
2288 c->lumMmx2FilterPos= av_malloc((dstW /2/8+8)*sizeof(int32_t));
2289 c->chrMmx2FilterPos= av_malloc((c->chrDstW/2/4+8)*sizeof(int32_t));
2290
2291 initMMX2HScaler( dstW, c->lumXInc, c->funnyYCode , c->lumMmx2Filter, c->lumMmx2FilterPos, 8);
2292 initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode, c->chrMmx2Filter, c->chrMmx2FilterPos, 4);
2293 }
2294#endif /* defined(COMPILE_MMX2) */
2295 } // Init Horizontal stuff
2296
2297
2298
2299 /* precalculate vertical scaler filter coefficients */
2300 {
2301 const int filterAlign=
2302 (flags & SWS_CPU_CAPS_MMX) && (flags & SWS_ACCURATE_RND) ? 2 :
2303 (flags & SWS_CPU_CAPS_ALTIVEC) ? 8 :
2304 1;
2305
2306 initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
2307 srcH , dstH, filterAlign, (1<<12)-4,
2308 (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC) : flags,
2309 srcFilter->lumV, dstFilter->lumV, c->param);
2310 initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
2311 c->chrSrcH, c->chrDstH, filterAlign, (1<<12)-4,
2312 (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags,
2313 srcFilter->chrV, dstFilter->chrV, c->param);
2314
2315#ifdef HAVE_ALTIVEC
2316 c->vYCoeffsBank = av_malloc(sizeof (vector signed short)*c->vLumFilterSize*c->dstH);
2317 c->vCCoeffsBank = av_malloc(sizeof (vector signed short)*c->vChrFilterSize*c->chrDstH);
2318
2319 for (i=0;i<c->vLumFilterSize*c->dstH;i++) {
2320 int j;
2321 short *p = (short *)&c->vYCoeffsBank[i];
2322 for (j=0;j<8;j++)
2323 p[j] = c->vLumFilter[i];
2324 }
2325
2326 for (i=0;i<c->vChrFilterSize*c->chrDstH;i++) {
2327 int j;
2328 short *p = (short *)&c->vCCoeffsBank[i];
2329 for (j=0;j<8;j++)
2330 p[j] = c->vChrFilter[i];
2331 }
2332#endif
2333 }
2334
2335 // Calculate Buffer Sizes so that they won't run out while handling these damn slices
2336 c->vLumBufSize= c->vLumFilterSize;
2337 c->vChrBufSize= c->vChrFilterSize;
2338 for (i=0; i<dstH; i++)
2339 {
2340 int chrI= i*c->chrDstH / dstH;
2341 int nextSlice= FFMAX(c->vLumFilterPos[i ] + c->vLumFilterSize - 1,
2342 ((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)<<c->chrSrcVSubSample));
2343
2344 nextSlice>>= c->chrSrcVSubSample;
2345 nextSlice<<= c->chrSrcVSubSample;
2346 if (c->vLumFilterPos[i ] + c->vLumBufSize < nextSlice)
2347 c->vLumBufSize= nextSlice - c->vLumFilterPos[i];
2348 if (c->vChrFilterPos[chrI] + c->vChrBufSize < (nextSlice>>c->chrSrcVSubSample))
2349 c->vChrBufSize= (nextSlice>>c->chrSrcVSubSample) - c->vChrFilterPos[chrI];
2350 }
2351
2352    // allocate pixbufs (we use dynamic allocation because otherwise we would need to allocate a worst-case sized buffer)
2353 c->lumPixBuf= av_malloc(c->vLumBufSize*2*sizeof(int16_t*));
2354 c->chrPixBuf= av_malloc(c->vChrBufSize*2*sizeof(int16_t*));
2355    //Note we need at least one pixel more at the end because of the MMX code (just in case someone wants to replace the 4000/8000)
2356 /* align at 16 bytes for AltiVec */
2357 for (i=0; i<c->vLumBufSize; i++)
2358 c->lumPixBuf[i]= c->lumPixBuf[i+c->vLumBufSize]= av_mallocz(VOF+1);
2359 for (i=0; i<c->vChrBufSize; i++)
2360 c->chrPixBuf[i]= c->chrPixBuf[i+c->vChrBufSize]= av_malloc((VOF+1)*2);
2361
2362 //try to avoid drawing green stuff between the right end and the stride end
2363 for (i=0; i<c->vChrBufSize; i++) memset(c->chrPixBuf[i], 64, (VOF+1)*2);
2364
2365 assert(2*VOFW == VOF);
2366
2367 assert(c->chrDstH <= dstH);
2368
2369 if (flags&SWS_PRINT_INFO)
2370 {
2371#ifdef DITHER1XBPP
2372 const char *dither= " dithered";
2373#else
2374 const char *dither= "";
2375#endif
2376 if (flags&SWS_FAST_BILINEAR)
2377 av_log(c, AV_LOG_INFO, "FAST_BILINEAR scaler, ");
2378 else if (flags&SWS_BILINEAR)
2379 av_log(c, AV_LOG_INFO, "BILINEAR scaler, ");
2380 else if (flags&SWS_BICUBIC)
2381 av_log(c, AV_LOG_INFO, "BICUBIC scaler, ");
2382 else if (flags&SWS_X)
2383 av_log(c, AV_LOG_INFO, "Experimental scaler, ");
2384 else if (flags&SWS_POINT)
2385 av_log(c, AV_LOG_INFO, "Nearest Neighbor / POINT scaler, ");
2386 else if (flags&SWS_AREA)
2387            av_log(c, AV_LOG_INFO, "Area Averaging scaler, ");
2388 else if (flags&SWS_BICUBLIN)
2389 av_log(c, AV_LOG_INFO, "luma BICUBIC / chroma BILINEAR scaler, ");
2390 else if (flags&SWS_GAUSS)
2391 av_log(c, AV_LOG_INFO, "Gaussian scaler, ");
2392 else if (flags&SWS_SINC)
2393 av_log(c, AV_LOG_INFO, "Sinc scaler, ");
2394 else if (flags&SWS_LANCZOS)
2395 av_log(c, AV_LOG_INFO, "Lanczos scaler, ");
2396 else if (flags&SWS_SPLINE)
2397 av_log(c, AV_LOG_INFO, "Bicubic spline scaler, ");
2398 else
2399 av_log(c, AV_LOG_INFO, "ehh flags invalid?! ");
2400
2401 if (dstFormat==PIX_FMT_BGR555 || dstFormat==PIX_FMT_BGR565)
2402 av_log(c, AV_LOG_INFO, "from %s to%s %s ",
2403 sws_format_name(srcFormat), dither, sws_format_name(dstFormat));
2404 else
2405 av_log(c, AV_LOG_INFO, "from %s to %s ",
2406 sws_format_name(srcFormat), sws_format_name(dstFormat));
2407
2408 if (flags & SWS_CPU_CAPS_MMX2)
2409 av_log(c, AV_LOG_INFO, "using MMX2\n");
2410 else if (flags & SWS_CPU_CAPS_3DNOW)
2411 av_log(c, AV_LOG_INFO, "using 3DNOW\n");
2412 else if (flags & SWS_CPU_CAPS_MMX)
2413 av_log(c, AV_LOG_INFO, "using MMX\n");
2414 else if (flags & SWS_CPU_CAPS_ALTIVEC)
2415 av_log(c, AV_LOG_INFO, "using AltiVec\n");
2416 else
2417 av_log(c, AV_LOG_INFO, "using C\n");
2418 }
2419
2420 if (flags & SWS_PRINT_INFO)
2421 {
2422 if (flags & SWS_CPU_CAPS_MMX)
2423 {
2424 if (c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
2425 av_log(c, AV_LOG_VERBOSE, "using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
2426 else
2427 {
2428 if (c->hLumFilterSize==4)
2429 av_log(c, AV_LOG_VERBOSE, "using 4-tap MMX scaler for horizontal luminance scaling\n");
2430 else if (c->hLumFilterSize==8)
2431 av_log(c, AV_LOG_VERBOSE, "using 8-tap MMX scaler for horizontal luminance scaling\n");
2432 else
2433 av_log(c, AV_LOG_VERBOSE, "using n-tap MMX scaler for horizontal luminance scaling\n");
2434
2435 if (c->hChrFilterSize==4)
2436 av_log(c, AV_LOG_VERBOSE, "using 4-tap MMX scaler for horizontal chrominance scaling\n");
2437 else if (c->hChrFilterSize==8)
2438 av_log(c, AV_LOG_VERBOSE, "using 8-tap MMX scaler for horizontal chrominance scaling\n");
2439 else
2440 av_log(c, AV_LOG_VERBOSE, "using n-tap MMX scaler for horizontal chrominance scaling\n");
2441 }
2442 }
2443 else
2444 {
2445#if defined(ARCH_X86)
2446 av_log(c, AV_LOG_VERBOSE, "using X86-Asm scaler for horizontal scaling\n");
2447#else
2448 if (flags & SWS_FAST_BILINEAR)
2449 av_log(c, AV_LOG_VERBOSE, "using FAST_BILINEAR C scaler for horizontal scaling\n");
2450 else
2451 av_log(c, AV_LOG_VERBOSE, "using C scaler for horizontal scaling\n");
2452#endif
2453 }
2454 if (isPlanarYUV(dstFormat))
2455 {
2456 if (c->vLumFilterSize==1)
2457 av_log(c, AV_LOG_VERBOSE, "using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
2458 else
2459 av_log(c, AV_LOG_VERBOSE, "using n-tap %s scaler for vertical scaling (YV12 like)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
2460 }
2461 else
2462 {
2463 if (c->vLumFilterSize==1 && c->vChrFilterSize==2)
2464 av_log(c, AV_LOG_VERBOSE, "using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
2465 " 2-tap scaler for vertical chrominance scaling (BGR)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
2466 else if (c->vLumFilterSize==2 && c->vChrFilterSize==2)
2467 av_log(c, AV_LOG_VERBOSE, "using 2-tap linear %s scaler for vertical scaling (BGR)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
2468 else
2469 av_log(c, AV_LOG_VERBOSE, "using n-tap %s scaler for vertical scaling (BGR)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
2470 }
2471
2472 if (dstFormat==PIX_FMT_BGR24)
2473 av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR24 Converter\n",
2474 (flags & SWS_CPU_CAPS_MMX2) ? "MMX2" : ((flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C"));
2475 else if (dstFormat==PIX_FMT_RGB32)
2476 av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR32 Converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
2477 else if (dstFormat==PIX_FMT_BGR565)
2478 av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR16 Converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
2479 else if (dstFormat==PIX_FMT_BGR555)
2480 av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR15 Converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
2481
2482 av_log(c, AV_LOG_VERBOSE, "%dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
2483 }
2484 if (flags & SWS_PRINT_INFO)
2485 {
2486 av_log(c, AV_LOG_DEBUG, "Lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
2487 c->srcW, c->srcH, c->dstW, c->dstH, c->lumXInc, c->lumYInc);
2488 av_log(c, AV_LOG_DEBUG, "Chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
2489 c->chrSrcW, c->chrSrcH, c->chrDstW, c->chrDstH, c->chrXInc, c->chrYInc);
2490 }
2491
2492 c->swScale= getSwsFunc(flags);
2493 return c;
2494}
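/*
 * A minimal end-to-end usage sketch of this API (illustrative only;
 * buffer setup and error checking omitted, sizes invented):
 */
#if 0
    uint8_t *srcData[4], *dstData[4];
    int srcStride[4], dstStride[4];
    /* ... point srcData/srcStride at a 640x480 YUV420P frame and
     * dstData/dstStride at a 320x240 RGB32 buffer ... */
    SwsContext *sc = sws_getContext(640, 480, PIX_FMT_YUV420P,
                                    320, 240, PIX_FMT_RGB32,
                                    SWS_BICUBIC, NULL, NULL, NULL);
    if (sc) {
        /* one full-frame "slice": starts at line 0, 480 lines high */
        sws_scale(sc, srcData, srcStride, 0, 480, dstData, dstStride);
        sws_freeContext(sc);
    }
#endif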
2495
2496/**
2497 * swscale wrapper, so we don't need to export the SwsContext.
2498 * Assumes planar YUV to be in YUV order rather than YVU.
2499 */
2500int sws_scale(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2501 int srcSliceH, uint8_t* dst[], int dstStride[]){
2502 int i;
2503 uint8_t* src2[4]= {src[0], src[1], src[2]};
2504 uint32_t pal[256];
2505 if (c->sliceDir == 0 && srcSliceY != 0 && srcSliceY + srcSliceH != c->srcH) {
2506 av_log(c, AV_LOG_ERROR, "Slices start in the middle!\n");
2507 return 0;
2508 }
2509 if (c->sliceDir == 0) {
2510 if (srcSliceY == 0) c->sliceDir = 1; else c->sliceDir = -1;
2511 }
2512
2513 if (c->srcFormat == PIX_FMT_PAL8){
2514 for (i=0; i<256; i++){
2515 int p= ((uint32_t*)(src[1]))[i];
2516 int r= (p>>16)&0xFF;
2517 int g= (p>> 8)&0xFF;
2518 int b= p &0xFF;
2519 int y= av_clip_uint8(((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16 );
2520 int u= av_clip_uint8(((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128);
2521 int v= av_clip_uint8(((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128);
2522 pal[i]= y + (u<<8) + (v<<16);
2523 }
2524 src2[1]= (uint8_t*)pal;
2525 }
2526
2527 // copy strides, so they can safely be modified
2528 if (c->sliceDir == 1) {
2529 // slices go from top to bottom
2530 int srcStride2[4]= {srcStride[0], srcStride[1], srcStride[2]};
2531 int dstStride2[4]= {dstStride[0], dstStride[1], dstStride[2]};
2532 return c->swScale(c, src2, srcStride2, srcSliceY, srcSliceH, dst, dstStride2);
2533 } else {
2534 // slices go from bottom to top => we flip the image internally
2535 uint8_t* dst2[4]= {dst[0] + (c->dstH-1)*dstStride[0],
2536 dst[1] + ((c->dstH>>c->chrDstVSubSample)-1)*dstStride[1],
2537 dst[2] + ((c->dstH>>c->chrDstVSubSample)-1)*dstStride[2]};
2538 int srcStride2[4]= {-srcStride[0], -srcStride[1], -srcStride[2]};
2539 int dstStride2[4]= {-dstStride[0], -dstStride[1], -dstStride[2]};
2540
2541 src2[0] += (srcSliceH-1)*srcStride[0];
2542 if (c->srcFormat != PIX_FMT_PAL8)
2543 src2[1] += ((srcSliceH>>c->chrSrcVSubSample)-1)*srcStride[1];
2544 src2[2] += ((srcSliceH>>c->chrSrcVSubSample)-1)*srcStride[2];
2545
2546 return c->swScale(c, src2, srcStride2, c->srcH-srcSliceY-srcSliceH, srcSliceH, dst2, dstStride2);
2547 }
2548}
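/* Note on the PAL8 path above: the integer transform is an RGB->YCbCr
 * weighting scaled by 2^RGB2YUV_SHIFT with limited-range offsets, so
 * (assuming BT.601-style coefficients) a white palette entry r=g=b=255
 * comes out as roughly y=235, u=v=128, and black as y=16. */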
2549
2550/**
2551 * swscale wrapper, so we don't need to export the SwsContext
2552 */
2553int sws_scale_ordered(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2554 int srcSliceH, uint8_t* dst[], int dstStride[]){
2555 return sws_scale(c, src, srcStride, srcSliceY, srcSliceH, dst, dstStride);
2556}
2557
2558SwsFilter *sws_getDefaultFilter(float lumaGBlur, float chromaGBlur,
2559 float lumaSharpen, float chromaSharpen,
2560 float chromaHShift, float chromaVShift,
2561 int verbose)
2562{
2563 SwsFilter *filter= av_malloc(sizeof(SwsFilter));
2564
2565 if (lumaGBlur!=0.0){
2566 filter->lumH= sws_getGaussianVec(lumaGBlur, 3.0);
2567 filter->lumV= sws_getGaussianVec(lumaGBlur, 3.0);
2568 }else{
2569 filter->lumH= sws_getIdentityVec();
2570 filter->lumV= sws_getIdentityVec();
2571 }
2572
2573 if (chromaGBlur!=0.0){
2574 filter->chrH= sws_getGaussianVec(chromaGBlur, 3.0);
2575 filter->chrV= sws_getGaussianVec(chromaGBlur, 3.0);
2576 }else{
2577 filter->chrH= sws_getIdentityVec();
2578 filter->chrV= sws_getIdentityVec();
2579 }
2580
2581 if (chromaSharpen!=0.0){
2582 SwsVector *id= sws_getIdentityVec();
2583 sws_scaleVec(filter->chrH, -chromaSharpen);
2584 sws_scaleVec(filter->chrV, -chromaSharpen);
2585 sws_addVec(filter->chrH, id);
2586 sws_addVec(filter->chrV, id);
2587 sws_freeVec(id);
2588 }
2589
2590 if (lumaSharpen!=0.0){
2591 SwsVector *id= sws_getIdentityVec();
2592 sws_scaleVec(filter->lumH, -lumaSharpen);
2593 sws_scaleVec(filter->lumV, -lumaSharpen);
2594 sws_addVec(filter->lumH, id);
2595 sws_addVec(filter->lumV, id);
2596 sws_freeVec(id);
2597 }
2598
2599 if (chromaHShift != 0.0)
2600 sws_shiftVec(filter->chrH, (int)(chromaHShift+0.5));
2601
2602 if (chromaVShift != 0.0)
2603 sws_shiftVec(filter->chrV, (int)(chromaVShift+0.5));
2604
2605 sws_normalizeVec(filter->chrH, 1.0);
2606 sws_normalizeVec(filter->chrV, 1.0);
2607 sws_normalizeVec(filter->lumH, 1.0);
2608 sws_normalizeVec(filter->lumV, 1.0);
2609
2610 if (verbose) sws_printVec(filter->chrH);
2611 if (verbose) sws_printVec(filter->lumH);
2612
2613 return filter;
2614}
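/*
 * A hypothetical call: mild Gaussian blur on luma only, everything else
 * identity; all four vectors end up normalized to a DC gain of 1.0:
 */
#if 0
    SwsFilter *f = sws_getDefaultFilter(0.8, 0.0,  /* luma/chroma blur */
                                        0.0, 0.0,  /* no sharpening */
                                        0.0, 0.0,  /* no chroma shift */
                                        0);        /* not verbose */
    /* ... pass f as srcFilter to sws_getContext() ... */
    sws_freeFilter(f);
#endif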
2615
2616/**
2617 * Returns a normalized Gaussian curve used for filtering.
2618 * quality = 3 is high quality, lower is lower quality.
2619 */
2620SwsVector *sws_getGaussianVec(double variance, double quality){
2621 const int length= (int)(variance*quality + 0.5) | 1;
2622 int i;
2623 double *coeff= av_malloc(length*sizeof(double));
2624 double middle= (length-1)*0.5;
2625 SwsVector *vec= av_malloc(sizeof(SwsVector));
2626
2627 vec->coeff= coeff;
2628 vec->length= length;
2629
2630 for (i=0; i<length; i++)
2631 {
2632 double dist= i-middle;
2633 coeff[i]= exp(-dist*dist/(2*variance*variance)) / sqrt(2*variance*PI);
2634 }
2635
2636 sws_normalizeVec(vec, 1.0);
2637
2638 return vec;
2639}
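/* Worked example: variance = 2.0, quality = 3.0 gives
 * length = ((int)(2.0*3.0 + 0.5)) | 1 = 7 taps centered on middle = 3;
 * the constant in front of exp() does not matter because
 * sws_normalizeVec() rescales the vector to sum to 1.0 anyway. */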
2640
2641SwsVector *sws_getConstVec(double c, int length){
2642 int i;
2643 double *coeff= av_malloc(length*sizeof(double));
2644 SwsVector *vec= av_malloc(sizeof(SwsVector));
2645
2646 vec->coeff= coeff;
2647 vec->length= length;
2648
2649 for (i=0; i<length; i++)
2650 coeff[i]= c;
2651
2652 return vec;
2653}
2654
2655
2656SwsVector *sws_getIdentityVec(void){
2657 return sws_getConstVec(1.0, 1);
2658}
2659
2660double sws_dcVec(SwsVector *a){
2661 int i;
2662 double sum=0;
2663
2664 for (i=0; i<a->length; i++)
2665 sum+= a->coeff[i];
2666
2667 return sum;
2668}
2669
2670void sws_scaleVec(SwsVector *a, double scalar){
2671 int i;
2672
2673 for (i=0; i<a->length; i++)
2674 a->coeff[i]*= scalar;
2675}
2676
2677void sws_normalizeVec(SwsVector *a, double height){
2678 sws_scaleVec(a, height/sws_dcVec(a));
2679}
2680
2681static SwsVector *sws_getConvVec(SwsVector *a, SwsVector *b){
2682 int length= a->length + b->length - 1;
2683 double *coeff= av_malloc(length*sizeof(double));
2684 int i, j;
2685 SwsVector *vec= av_malloc(sizeof(SwsVector));
2686
2687 vec->coeff= coeff;
2688 vec->length= length;
2689
2690 for (i=0; i<length; i++) coeff[i]= 0.0;
2691
2692 for (i=0; i<a->length; i++)
2693 {
2694 for (j=0; j<b->length; j++)
2695 {
2696 coeff[i+j]+= a->coeff[i]*b->coeff[j];
2697 }
2698 }
2699
2700 return vec;
2701}
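/* Worked example: convolving a = [1, 2, 1] with b = [1, 1] yields the
 * length-4 vector [1, 3, 3, 1] (length = 3 + 2 - 1); this is how
 * sws_convVec() combines two kernels, e.g. a blur with a sharpen. */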
2702
2703static SwsVector *sws_sumVec(SwsVector *a, SwsVector *b){
2704 int length= FFMAX(a->length, b->length);
2705 double *coeff= av_malloc(length*sizeof(double));
2706 int i;
2707 SwsVector *vec= av_malloc(sizeof(SwsVector));
2708
2709 vec->coeff= coeff;
2710 vec->length= length;
2711
2712 for (i=0; i<length; i++) coeff[i]= 0.0;
2713
2714 for (i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
2715 for (i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]+= b->coeff[i];
2716
2717 return vec;
2718}
2719
2720static SwsVector *sws_diffVec(SwsVector *a, SwsVector *b){
2721 int length= FFMAX(a->length, b->length);
2722 double *coeff= av_malloc(length*sizeof(double));
2723 int i;
2724 SwsVector *vec= av_malloc(sizeof(SwsVector));
2725
2726 vec->coeff= coeff;
2727 vec->length= length;
2728
2729 for (i=0; i<length; i++) coeff[i]= 0.0;
2730
2731 for (i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
2732 for (i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]-= b->coeff[i];
2733
2734 return vec;
2735}
2736
2737/* shift left, or right if "shift" is negative */
2738static SwsVector *sws_getShiftedVec(SwsVector *a, int shift){
2739 int length= a->length + FFABS(shift)*2;
2740 double *coeff= av_malloc(length*sizeof(double));
2741 int i;
2742 SwsVector *vec= av_malloc(sizeof(SwsVector));
2743
2744 vec->coeff= coeff;
2745 vec->length= length;
2746
2747 for (i=0; i<length; i++) coeff[i]= 0.0;
2748
2749 for (i=0; i<a->length; i++)
2750 {
2751 coeff[i + (length-1)/2 - (a->length-1)/2 - shift]= a->coeff[i];
2752 }
2753
2754 return vec;
2755}
2756
2757void sws_shiftVec(SwsVector *a, int shift){
2758 SwsVector *shifted= sws_getShiftedVec(a, shift);
2759 av_free(a->coeff);
2760 a->coeff= shifted->coeff;
2761 a->length= shifted->length;
2762 av_free(shifted);
2763}
2764
2765void sws_addVec(SwsVector *a, SwsVector *b){
2766 SwsVector *sum= sws_sumVec(a, b);
2767 av_free(a->coeff);
2768 a->coeff= sum->coeff;
2769 a->length= sum->length;
2770 av_free(sum);
2771}
2772
2773void sws_subVec(SwsVector *a, SwsVector *b){
2774 SwsVector *diff= sws_diffVec(a, b);
2775 av_free(a->coeff);
2776 a->coeff= diff->coeff;
2777 a->length= diff->length;
2778 av_free(diff);
2779}
2780
2781void sws_convVec(SwsVector *a, SwsVector *b){
2782 SwsVector *conv= sws_getConvVec(a, b);
2783 av_free(a->coeff);
2784 a->coeff= conv->coeff;
2785 a->length= conv->length;
2786 av_free(conv);
2787}
2788
2789SwsVector *sws_cloneVec(SwsVector *a){
2790 double *coeff= av_malloc(a->length*sizeof(double));
2791 int i;
2792 SwsVector *vec= av_malloc(sizeof(SwsVector));
2793
2794 vec->coeff= coeff;
2795 vec->length= a->length;
2796
2797 for (i=0; i<a->length; i++) coeff[i]= a->coeff[i];
2798
2799 return vec;
2800}
2801
2802void sws_printVec(SwsVector *a){
2803 int i;
2804 double max=0;
2805 double min=0;
2806 double range;
2807
2808 for (i=0; i<a->length; i++)
2809 if (a->coeff[i]>max) max= a->coeff[i];
2810
2811 for (i=0; i<a->length; i++)
2812 if (a->coeff[i]<min) min= a->coeff[i];
2813
2814 range= max - min;
2815
2816 for (i=0; i<a->length; i++)
2817 {
2818 int x= (int)((a->coeff[i]-min)*60.0/range +0.5);
2819 av_log(NULL, AV_LOG_DEBUG, "%1.3f ", a->coeff[i]);
2820 for (;x>0; x--) av_log(NULL, AV_LOG_DEBUG, " ");
2821 av_log(NULL, AV_LOG_DEBUG, "|\n");
2822 }
2823}
2824
2825void sws_freeVec(SwsVector *a){
2826 if (!a) return;
2827 av_freep(&a->coeff);
2828 a->length=0;
2829 av_free(a);
2830}
2831
2832void sws_freeFilter(SwsFilter *filter){
2833 if (!filter) return;
2834
2835 if (filter->lumH) sws_freeVec(filter->lumH);
2836 if (filter->lumV) sws_freeVec(filter->lumV);
2837 if (filter->chrH) sws_freeVec(filter->chrH);
2838 if (filter->chrV) sws_freeVec(filter->chrV);
2839 av_free(filter);
2840}
2841
2842
2843void sws_freeContext(SwsContext *c){
2844 int i;
2845 if (!c) return;
2846
2847 if (c->lumPixBuf)
2848 {
2849 for (i=0; i<c->vLumBufSize; i++)
2850 av_freep(&c->lumPixBuf[i]);
2851 av_freep(&c->lumPixBuf);
2852 }
2853
2854 if (c->chrPixBuf)
2855 {
2856 for (i=0; i<c->vChrBufSize; i++)
2857 av_freep(&c->chrPixBuf[i]);
2858 av_freep(&c->chrPixBuf);
2859 }
2860
2861 av_freep(&c->vLumFilter);
2862 av_freep(&c->vChrFilter);
2863 av_freep(&c->hLumFilter);
2864 av_freep(&c->hChrFilter);
2865#ifdef HAVE_ALTIVEC
2866 av_freep(&c->vYCoeffsBank);
2867 av_freep(&c->vCCoeffsBank);
2868#endif
2869
2870 av_freep(&c->vLumFilterPos);
2871 av_freep(&c->vChrFilterPos);
2872 av_freep(&c->hLumFilterPos);
2873 av_freep(&c->hChrFilterPos);
2874
2875#if defined(ARCH_X86) && defined(CONFIG_GPL)
2876#ifdef MAP_ANONYMOUS
2877 if (c->funnyYCode) munmap(c->funnyYCode, MAX_FUNNY_CODE_SIZE);
2878 if (c->funnyUVCode) munmap(c->funnyUVCode, MAX_FUNNY_CODE_SIZE);
2879#else
2880 av_free(c->funnyYCode);
2881 av_free(c->funnyUVCode);
2882#endif
2883 c->funnyYCode=NULL;
2884 c->funnyUVCode=NULL;
2885#endif /* defined(ARCH_X86) */
2886
2887 av_freep(&c->lumMmx2Filter);
2888 av_freep(&c->chrMmx2Filter);
2889 av_freep(&c->lumMmx2FilterPos);
2890 av_freep(&c->chrMmx2FilterPos);
2891 av_freep(&c->yuvTable);
2892
2893 av_free(c);
2894}
2895
2896/**
2897 * Checks whether the context can be reused; otherwise reallocates a new one.
2898 * If context is NULL, just calls sws_getContext() to get a new one.
2899 * Otherwise, checks if the parameters are the same already saved in context.
2900 * If that is the case, returns the current context.
2901 * Otherwise, frees context and gets a new one.
2902 *
2903 * Be warned that srcFilter and dstFilter are not checked; they are
2904 * assumed to remain valid.
2905 */
2906struct SwsContext *sws_getCachedContext(struct SwsContext *context,
2907 int srcW, int srcH, int srcFormat,
2908 int dstW, int dstH, int dstFormat, int flags,
2909 SwsFilter *srcFilter, SwsFilter *dstFilter, const double *param)
2910{
2911 static const double default_param[2] = {SWS_PARAM_DEFAULT, SWS_PARAM_DEFAULT};
2912
2913 if (!param)
2914 param = default_param;
2915
2916 if (context) {
2917 if (context->srcW != srcW || context->srcH != srcH ||
2918 context->srcFormat != srcFormat ||
2919 context->dstW != dstW || context->dstH != dstH ||
2920 context->dstFormat != dstFormat || context->flags != flags ||
2921 context->param[0] != param[0] || context->param[1] != param[1])
2922 {
2923 sws_freeContext(context);
2924 context = NULL;
2925 }
2926 }
2927 if (!context) {
2928 return sws_getContext(srcW, srcH, srcFormat,
2929 dstW, dstH, dstFormat, flags,
2930 srcFilter, dstFilter, param);
2931 }
2932 return context;
2933}
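/*
 * A hypothetical per-frame loop: the cached variant tears the context
 * down only when the geometry or flags change (e.g. a resizable player
 * window). Names marked "hypothetical" are not part of this library:
 */
#if 0
    struct SwsContext *sc = NULL;
    while (have_frames()) {              /* hypothetical frame source */
        sc = sws_getCachedContext(sc, srcW, srcH, PIX_FMT_YUV420P,
                                  winW, winH, PIX_FMT_RGB32,
                                  SWS_BILINEAR, NULL, NULL, NULL);
        sws_scale(sc, src, srcStride, 0, srcH, dst, dstStride);
    }
    sws_freeContext(sc);
#endif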
2934
diff --git a/src/plugins/ffmpeg/libswscale/swscale.h b/src/plugins/ffmpeg/libswscale/swscale.h
deleted file mode 100644
index 3a5b460..0000000
--- a/src/plugins/ffmpeg/libswscale/swscale.h
+++ /dev/null
@@ -1,146 +0,0 @@
1/*
2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#ifndef FFMPEG_SWSCALE_H
22#define FFMPEG_SWSCALE_H
23
24/**
25 * @file swscale.h
26 * @brief
27 * external API for the swscale stuff
28 */
29
30#include "libavutil/avutil.h"
31
32#define LIBSWSCALE_VERSION_MAJOR 0
33#define LIBSWSCALE_VERSION_MINOR 5
34#define LIBSWSCALE_VERSION_MICRO 1
35
36#define LIBSWSCALE_VERSION_INT AV_VERSION_INT(LIBSWSCALE_VERSION_MAJOR, \
37 LIBSWSCALE_VERSION_MINOR, \
38 LIBSWSCALE_VERSION_MICRO)
39#define LIBSWSCALE_VERSION AV_VERSION(LIBSWSCALE_VERSION_MAJOR, \
40 LIBSWSCALE_VERSION_MINOR, \
41 LIBSWSCALE_VERSION_MICRO)
42#define LIBSWSCALE_BUILD LIBSWSCALE_VERSION_INT
43
44#define LIBSWSCALE_IDENT "SwS" AV_STRINGIFY(LIBSWSCALE_VERSION)
45
46/* values for the flags, the stuff on the command line is different */
47#define SWS_FAST_BILINEAR 1
48#define SWS_BILINEAR 2
49#define SWS_BICUBIC 4
50#define SWS_X 8
51#define SWS_POINT 0x10
52#define SWS_AREA 0x20
53#define SWS_BICUBLIN 0x40
54#define SWS_GAUSS 0x80
55#define SWS_SINC 0x100
56#define SWS_LANCZOS 0x200
57#define SWS_SPLINE 0x400
58
59#define SWS_SRC_V_CHR_DROP_MASK 0x30000
60#define SWS_SRC_V_CHR_DROP_SHIFT 16
61
62#define SWS_PARAM_DEFAULT 123456
63
64#define SWS_PRINT_INFO 0x1000
65
66//the following 3 flags are not completely implemented
67//internal chrominance subsampling info
68#define SWS_FULL_CHR_H_INT 0x2000
69//input subsampling info
70#define SWS_FULL_CHR_H_INP 0x4000
71#define SWS_DIRECT_BGR 0x8000
72#define SWS_ACCURATE_RND 0x40000
73
74#define SWS_CPU_CAPS_MMX 0x80000000
75#define SWS_CPU_CAPS_MMX2 0x20000000
76#define SWS_CPU_CAPS_3DNOW 0x40000000
77#define SWS_CPU_CAPS_ALTIVEC 0x10000000
78#define SWS_CPU_CAPS_BFIN 0x01000000
79
80#define SWS_MAX_REDUCE_CUTOFF 0.002
81
82#define SWS_CS_ITU709 1
83#define SWS_CS_FCC 4
84#define SWS_CS_ITU601 5
85#define SWS_CS_ITU624 5
86#define SWS_CS_SMPTE170M 5
87#define SWS_CS_SMPTE240M 7
88#define SWS_CS_DEFAULT 5
89
90
91
92// when used for filters they must have an odd number of elements
93// coeffs cannot be shared between vectors
94typedef struct {
95 double *coeff;
96 int length;
97} SwsVector;
98
99// vectors can be shared
100typedef struct {
101 SwsVector *lumH;
102 SwsVector *lumV;
103 SwsVector *chrH;
104 SwsVector *chrV;
105} SwsFilter;
106
107struct SwsContext;
108
109void sws_freeContext(struct SwsContext *swsContext);
110
111struct SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
112 SwsFilter *srcFilter, SwsFilter *dstFilter, const double *param);
113int sws_scale(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
114 int srcSliceH, uint8_t* dst[], int dstStride[]);
115int sws_scale_ordered(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
116 int srcSliceH, uint8_t* dst[], int dstStride[]) attribute_deprecated;
117
118
119int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4], int srcRange, const int table[4], int dstRange, int brightness, int contrast, int saturation);
120int sws_getColorspaceDetails(struct SwsContext *c, int **inv_table, int *srcRange, int **table, int *dstRange, int *brightness, int *contrast, int *saturation);
121SwsVector *sws_getGaussianVec(double variance, double quality);
122SwsVector *sws_getConstVec(double c, int length);
123SwsVector *sws_getIdentityVec(void);
124void sws_scaleVec(SwsVector *a, double scalar);
125void sws_normalizeVec(SwsVector *a, double height);
126void sws_convVec(SwsVector *a, SwsVector *b);
127void sws_addVec(SwsVector *a, SwsVector *b);
128void sws_subVec(SwsVector *a, SwsVector *b);
129void sws_shiftVec(SwsVector *a, int shift);
130SwsVector *sws_cloneVec(SwsVector *a);
131
132void sws_printVec(SwsVector *a);
133void sws_freeVec(SwsVector *a);
134
135SwsFilter *sws_getDefaultFilter(float lumaGBlur, float chromaGBlur,
136                                float lumaSharpen, float chromaSharpen,
137 float chromaHShift, float chromaVShift,
138 int verbose);
139void sws_freeFilter(SwsFilter *filter);
140
141struct SwsContext *sws_getCachedContext(struct SwsContext *context,
142 int srcW, int srcH, int srcFormat,
143 int dstW, int dstH, int dstFormat, int flags,
144 SwsFilter *srcFilter, SwsFilter *dstFilter, const double *param);
145
146#endif /* FFMPEG_SWSCALE_H */
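
The header above is the entire public interface of this copy of libswscale. A minimal whole-frame conversion with it would look roughly like the sketch below (assuming PIX_FMT_YUV420P and PIX_FMT_RGB24 from libavutil, caller-allocated planes, and no custom filters; error handling trimmed):

/* Minimal sketch of the API declared above; not the only valid call sequence. */
#include "swscale.h"

static int yuv420p_to_rgb24(uint8_t *src[3], int srcStride[3],
                            uint8_t *dst[1], int dstStride[1],
                            int w, int h)
{
    struct SwsContext *ctx =
        sws_getContext(w, h, PIX_FMT_YUV420P,   /* source geometry/format */
                       w, h, PIX_FMT_RGB24,     /* destination            */
                       SWS_BICUBIC,             /* scaler selection flag  */
                       NULL, NULL, NULL);       /* no filters, no params  */
    int out;

    if (!ctx)
        return -1;
    /* one slice covering the whole image, starting at source line 0 */
    out = sws_scale(ctx, src, srcStride, 0, h, dst, dstStride);
    sws_freeContext(ctx);
    return out;
}

For repeated calls where the parameters rarely change, sws_getCachedContext() above exists to reuse a previously created context instead of rebuilding the filter tables each time.
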
diff --git a/src/plugins/ffmpeg/libswscale/swscale_altivec_template.c b/src/plugins/ffmpeg/libswscale/swscale_altivec_template.c
deleted file mode 100644
index 2111cec..0000000
--- a/src/plugins/ffmpeg/libswscale/swscale_altivec_template.c
+++ /dev/null
@@ -1,538 +0,0 @@
1/*
2 * AltiVec-enhanced yuv2yuvX
3 *
4 * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5 * based on the equivalent C code in swscale.c
6 *
7 * This file is part of FFmpeg.
8 *
9 * FFmpeg is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24#define vzero vec_splat_s32(0)
25
26static inline void
27altivec_packIntArrayToCharArray(int *val, uint8_t* dest, int dstW) {
28 register int i;
29 vector unsigned int altivec_vectorShiftInt19 =
30 vec_add(vec_splat_u32(10), vec_splat_u32(9));
31 if ((unsigned long)dest % 16) {
32 /* badly aligned store, we force store alignment */
33 /* and will handle load misalignment on val w/ vec_perm */
34 vector unsigned char perm1;
35 vector signed int v1;
36 for (i = 0 ; (i < dstW) &&
37 (((unsigned long)dest + i) % 16) ; i++) {
38 int t = val[i] >> 19;
39 dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t);
40 }
41 perm1 = vec_lvsl(i << 2, val);
42 v1 = vec_ld(i << 2, val);
43 for ( ; i < (dstW - 15); i+=16) {
44 int offset = i << 2;
45 vector signed int v2 = vec_ld(offset + 16, val);
46 vector signed int v3 = vec_ld(offset + 32, val);
47 vector signed int v4 = vec_ld(offset + 48, val);
48 vector signed int v5 = vec_ld(offset + 64, val);
49 vector signed int v12 = vec_perm(v1, v2, perm1);
50 vector signed int v23 = vec_perm(v2, v3, perm1);
51 vector signed int v34 = vec_perm(v3, v4, perm1);
52 vector signed int v45 = vec_perm(v4, v5, perm1);
53
54 vector signed int vA = vec_sra(v12, altivec_vectorShiftInt19);
55 vector signed int vB = vec_sra(v23, altivec_vectorShiftInt19);
56 vector signed int vC = vec_sra(v34, altivec_vectorShiftInt19);
57 vector signed int vD = vec_sra(v45, altivec_vectorShiftInt19);
58 vector unsigned short vs1 = vec_packsu(vA, vB);
59 vector unsigned short vs2 = vec_packsu(vC, vD);
60 vector unsigned char vf = vec_packsu(vs1, vs2);
61 vec_st(vf, i, dest);
62 v1 = v5;
63 }
64 } else { // dest is properly aligned, great
65 for (i = 0; i < (dstW - 15); i+=16) {
66 int offset = i << 2;
67 vector signed int v1 = vec_ld(offset, val);
68 vector signed int v2 = vec_ld(offset + 16, val);
69 vector signed int v3 = vec_ld(offset + 32, val);
70 vector signed int v4 = vec_ld(offset + 48, val);
71 vector signed int v5 = vec_sra(v1, altivec_vectorShiftInt19);
72 vector signed int v6 = vec_sra(v2, altivec_vectorShiftInt19);
73 vector signed int v7 = vec_sra(v3, altivec_vectorShiftInt19);
74 vector signed int v8 = vec_sra(v4, altivec_vectorShiftInt19);
75 vector unsigned short vs1 = vec_packsu(v5, v6);
76 vector unsigned short vs2 = vec_packsu(v7, v8);
77 vector unsigned char vf = vec_packsu(vs1, vs2);
78 vec_st(vf, i, dest);
79 }
80 }
81 for ( ; i < dstW ; i++) {
82 int t = val[i] >> 19;
83 dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t);
84 }
85}
86
87static inline void
88yuv2yuvX_altivec_real(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
89 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
90 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
91{
92 const vector signed int vini = {(1 << 18), (1 << 18), (1 << 18), (1 << 18)};
93 register int i, j;
94 {
95 int __attribute__ ((aligned (16))) val[dstW];
96
97 for (i = 0; i < (dstW -7); i+=4) {
98 vec_st(vini, i << 2, val);
99 }
100 for (; i < dstW; i++) {
101 val[i] = (1 << 18);
102 }
103
104 for (j = 0; j < lumFilterSize; j++) {
105 vector signed short l1, vLumFilter = vec_ld(j << 1, lumFilter);
106 vector unsigned char perm, perm0 = vec_lvsl(j << 1, lumFilter);
107 vLumFilter = vec_perm(vLumFilter, vLumFilter, perm0);
108 vLumFilter = vec_splat(vLumFilter, 0); // lumFilter[j] is loaded 8 times in vLumFilter
109
110 perm = vec_lvsl(0, lumSrc[j]);
111 l1 = vec_ld(0, lumSrc[j]);
112
113 for (i = 0; i < (dstW - 7); i+=8) {
114 int offset = i << 2;
115 vector signed short l2 = vec_ld((i << 1) + 16, lumSrc[j]);
116
117 vector signed int v1 = vec_ld(offset, val);
118 vector signed int v2 = vec_ld(offset + 16, val);
119
120 vector signed short ls = vec_perm(l1, l2, perm); // lumSrc[j][i] ... lumSrc[j][i+7]
121
122 vector signed int i1 = vec_mule(vLumFilter, ls);
123 vector signed int i2 = vec_mulo(vLumFilter, ls);
124
125 vector signed int vf1 = vec_mergeh(i1, i2);
126 vector signed int vf2 = vec_mergel(i1, i2); // lumSrc[j][i] * lumFilter[j] ... lumSrc[j][i+7] * lumFilter[j]
127
128 vector signed int vo1 = vec_add(v1, vf1);
129 vector signed int vo2 = vec_add(v2, vf2);
130
131 vec_st(vo1, offset, val);
132 vec_st(vo2, offset + 16, val);
133
134 l1 = l2;
135 }
136 for ( ; i < dstW; i++) {
137 val[i] += lumSrc[j][i] * lumFilter[j];
138 }
139 }
140 altivec_packIntArrayToCharArray(val, dest, dstW);
141 }
142 if (uDest != 0) {
143 int __attribute__ ((aligned (16))) u[chrDstW];
144 int __attribute__ ((aligned (16))) v[chrDstW];
145
146 for (i = 0; i < (chrDstW -7); i+=4) {
147 vec_st(vini, i << 2, u);
148 vec_st(vini, i << 2, v);
149 }
150 for (; i < chrDstW; i++) {
151 u[i] = (1 << 18);
152 v[i] = (1 << 18);
153 }
154
155 for (j = 0; j < chrFilterSize; j++) {
156 vector signed short l1, l1_V, vChrFilter = vec_ld(j << 1, chrFilter);
157 vector unsigned char perm, perm0 = vec_lvsl(j << 1, chrFilter);
158 vChrFilter = vec_perm(vChrFilter, vChrFilter, perm0);
159 vChrFilter = vec_splat(vChrFilter, 0); // chrFilter[j] is loaded 8 times in vChrFilter
160
161 perm = vec_lvsl(0, chrSrc[j]);
162 l1 = vec_ld(0, chrSrc[j]);
163 l1_V = vec_ld(2048 << 1, chrSrc[j]);
164
165 for (i = 0; i < (chrDstW - 7); i+=8) {
166 int offset = i << 2;
167 vector signed short l2 = vec_ld((i << 1) + 16, chrSrc[j]);
168 vector signed short l2_V = vec_ld(((i + 2048) << 1) + 16, chrSrc[j]);
169
170 vector signed int v1 = vec_ld(offset, u);
171 vector signed int v2 = vec_ld(offset + 16, u);
172 vector signed int v1_V = vec_ld(offset, v);
173 vector signed int v2_V = vec_ld(offset + 16, v);
174
175 vector signed short ls = vec_perm(l1, l2, perm); // chrSrc[j][i] ... chrSrc[j][i+7]
176 vector signed short ls_V = vec_perm(l1_V, l2_V, perm); // chrSrc[j][i+2048] ... chrSrc[j][i+2055]
177
178 vector signed int i1 = vec_mule(vChrFilter, ls);
179 vector signed int i2 = vec_mulo(vChrFilter, ls);
180 vector signed int i1_V = vec_mule(vChrFilter, ls_V);
181 vector signed int i2_V = vec_mulo(vChrFilter, ls_V);
182
183 vector signed int vf1 = vec_mergeh(i1, i2);
184 vector signed int vf2 = vec_mergel(i1, i2); // chrSrc[j][i] * chrFilter[j] ... chrSrc[j][i+7] * chrFilter[j]
185 vector signed int vf1_V = vec_mergeh(i1_V, i2_V);
186                vector signed int vf2_V = vec_mergel(i1_V, i2_V); // chrSrc[j][i+2048] * chrFilter[j] ... chrSrc[j][i+2055] * chrFilter[j]
187
188 vector signed int vo1 = vec_add(v1, vf1);
189 vector signed int vo2 = vec_add(v2, vf2);
190 vector signed int vo1_V = vec_add(v1_V, vf1_V);
191 vector signed int vo2_V = vec_add(v2_V, vf2_V);
192
193 vec_st(vo1, offset, u);
194 vec_st(vo2, offset + 16, u);
195 vec_st(vo1_V, offset, v);
196 vec_st(vo2_V, offset + 16, v);
197
198 l1 = l2;
199 l1_V = l2_V;
200 }
201 for ( ; i < chrDstW; i++) {
202 u[i] += chrSrc[j][i] * chrFilter[j];
203 v[i] += chrSrc[j][i + 2048] * chrFilter[j];
204 }
205 }
206 altivec_packIntArrayToCharArray(u, uDest, chrDstW);
207 altivec_packIntArrayToCharArray(v, vDest, chrDstW);
208 }
209}
210
211static inline void hScale_altivec_real(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, int16_t *filter, int16_t *filterPos, int filterSize) {
212 register int i;
213 int __attribute__ ((aligned (16))) tempo[4];
214
215 if (filterSize % 4) {
216 for (i=0; i<dstW; i++) {
217 register int j;
218 register int srcPos = filterPos[i];
219 register int val = 0;
220 for (j=0; j<filterSize; j++) {
221 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
222 }
223 dst[i] = av_clip(val>>7, 0, (1<<15)-1);
224 }
225 }
226 else
227 switch (filterSize) {
228 case 4:
229 {
230 for (i=0; i<dstW; i++) {
231 register int srcPos = filterPos[i];
232
233 vector unsigned char src_v0 = vec_ld(srcPos, src);
234 vector unsigned char src_v1, src_vF;
235 vector signed short src_v, filter_v;
236 vector signed int val_vEven, val_s;
237 if ((((int)src + srcPos)% 16) > 12) {
238 src_v1 = vec_ld(srcPos + 16, src);
239 }
240 src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));
241
242 src_v = // vec_unpackh sign-extends...
243 (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
244 // now put our elements in the even slots
245 src_v = vec_mergeh(src_v, (vector signed short)vzero);
246
247 filter_v = vec_ld(i << 3, filter);
248 // The 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2).
249
250 // The neat trick: We only care for half the elements,
251 // high or low depending on (i<<3)%16 (it's 0 or 8 here),
252 // and we're going to use vec_mule, so we choose
253 // carefully how to "unpack" the elements into the even slots.
254 if ((i << 3) % 16)
255 filter_v = vec_mergel(filter_v, (vector signed short)vzero);
256 else
257 filter_v = vec_mergeh(filter_v, (vector signed short)vzero);
258
259 val_vEven = vec_mule(src_v, filter_v);
260 val_s = vec_sums(val_vEven, vzero);
261 vec_st(val_s, 0, tempo);
262 dst[i] = av_clip(tempo[3]>>7, 0, (1<<15)-1);
263 }
264 }
265 break;
266
267 case 8:
268 {
269 for (i=0; i<dstW; i++) {
270 register int srcPos = filterPos[i];
271
272 vector unsigned char src_v0 = vec_ld(srcPos, src);
273 vector unsigned char src_v1, src_vF;
274 vector signed short src_v, filter_v;
275 vector signed int val_v, val_s;
276 if ((((int)src + srcPos)% 16) > 8) {
277 src_v1 = vec_ld(srcPos + 16, src);
278 }
279 src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));
280
281 src_v = // vec_unpackh sign-extends...
282 (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
283 filter_v = vec_ld(i << 4, filter);
284 // the 4 above is 3 (filterSize == 8) + 1 (sizeof(short) == 2)
285
286 val_v = vec_msums(src_v, filter_v, (vector signed int)vzero);
287 val_s = vec_sums(val_v, vzero);
288 vec_st(val_s, 0, tempo);
289 dst[i] = av_clip(tempo[3]>>7, 0, (1<<15)-1);
290 }
291 }
292 break;
293
294 case 16:
295 {
296 for (i=0; i<dstW; i++) {
297 register int srcPos = filterPos[i];
298
299 vector unsigned char src_v0 = vec_ld(srcPos, src);
300 vector unsigned char src_v1 = vec_ld(srcPos + 16, src);
301 vector unsigned char src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));
302
303 vector signed short src_vA = // vec_unpackh sign-extends...
304 (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
305 vector signed short src_vB = // vec_unpackh sign-extends...
306 (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF));
307
308 vector signed short filter_v0 = vec_ld(i << 5, filter);
309 vector signed short filter_v1 = vec_ld((i << 5) + 16, filter);
310                // the 5 above is 4 (filterSize == 16) + 1 (sizeof(short) == 2)
311
312 vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero);
313 vector signed int val_v = vec_msums(src_vB, filter_v1, val_acc);
314
315 vector signed int val_s = vec_sums(val_v, vzero);
316
317 vec_st(val_s, 0, tempo);
318 dst[i] = av_clip(tempo[3]>>7, 0, (1<<15)-1);
319 }
320 }
321 break;
322
323 default:
324 {
325 for (i=0; i<dstW; i++) {
326 register int j;
327 register int srcPos = filterPos[i];
328
329 vector signed int val_s, val_v = (vector signed int)vzero;
330 vector signed short filter_v0R = vec_ld(i * 2 * filterSize, filter);
331 vector unsigned char permF = vec_lvsl((i * 2 * filterSize), filter);
332
333 vector unsigned char src_v0 = vec_ld(srcPos, src);
334 vector unsigned char permS = vec_lvsl(srcPos, src);
335
336 for (j = 0 ; j < filterSize - 15; j += 16) {
337 vector unsigned char src_v1 = vec_ld(srcPos + j + 16, src);
338 vector unsigned char src_vF = vec_perm(src_v0, src_v1, permS);
339
340 vector signed short src_vA = // vec_unpackh sign-extends...
341 (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
342 vector signed short src_vB = // vec_unpackh sign-extends...
343 (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF));
344
345 vector signed short filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter);
346 vector signed short filter_v2R = vec_ld((i * 2 * filterSize) + (j * 2) + 32, filter);
347 vector signed short filter_v0 = vec_perm(filter_v0R, filter_v1R, permF);
348 vector signed short filter_v1 = vec_perm(filter_v1R, filter_v2R, permF);
349
350 vector signed int val_acc = vec_msums(src_vA, filter_v0, val_v);
351 val_v = vec_msums(src_vB, filter_v1, val_acc);
352
353 filter_v0R = filter_v2R;
354 src_v0 = src_v1;
355 }
356
357 if (j < filterSize-7) {
358 // loading src_v0 is useless, it's already done above
359 //vector unsigned char src_v0 = vec_ld(srcPos + j, src);
360 vector unsigned char src_v1, src_vF;
361 vector signed short src_v, filter_v1R, filter_v;
362 if ((((int)src + srcPos)% 16) > 8) {
363 src_v1 = vec_ld(srcPos + j + 16, src);
364 }
365 src_vF = vec_perm(src_v0, src_v1, permS);
366
367 src_v = // vec_unpackh sign-extends...
368 (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
369 // loading filter_v0R is useless, it's already done above
370 //vector signed short filter_v0R = vec_ld((i * 2 * filterSize) + j, filter);
371 filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter);
372 filter_v = vec_perm(filter_v0R, filter_v1R, permF);
373
374 val_v = vec_msums(src_v, filter_v, val_v);
375 }
376
377 val_s = vec_sums(val_v, vzero);
378
379 vec_st(val_s, 0, tempo);
380 dst[i] = av_clip(tempo[3]>>7, 0, (1<<15)-1);
381 }
382
383 }
384 }
385}
386
387static inline int yv12toyuy2_unscaled_altivec(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
388 int srcSliceH, uint8_t* dstParam[], int dstStride_a[]) {
389 uint8_t *dst=dstParam[0] + dstStride_a[0]*srcSliceY;
390 // yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0], srcStride[1], dstStride[0]);
391 uint8_t *ysrc = src[0];
392 uint8_t *usrc = src[1];
393 uint8_t *vsrc = src[2];
394 const int width = c->srcW;
395 const int height = srcSliceH;
396 const int lumStride = srcStride[0];
397 const int chromStride = srcStride[1];
398 const int dstStride = dstStride_a[0];
399 const vector unsigned char yperm = vec_lvsl(0, ysrc);
400 const int vertLumPerChroma = 2;
401 register unsigned int y;
402
403 if (width&15) {
404 yv12toyuy2(ysrc, usrc, vsrc, dst, c->srcW, srcSliceH, lumStride, chromStride, dstStride);
405 return srcSliceH;
406 }
407
408 /* This code assumes:
409
410    1) dst is 16-byte aligned
411    2) dstStride is a multiple of 16
412    3) width is a multiple of 16
413    4) luma & chroma strides are multiples of 8
414 */
415
416 for (y=0; y<height; y++) {
417 int i;
418 for (i = 0; i < width - 31; i+= 32) {
419 const unsigned int j = i >> 1;
420 vector unsigned char v_yA = vec_ld(i, ysrc);
421 vector unsigned char v_yB = vec_ld(i + 16, ysrc);
422 vector unsigned char v_yC = vec_ld(i + 32, ysrc);
423 vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
424 vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm);
425 vector unsigned char v_uA = vec_ld(j, usrc);
426 vector unsigned char v_uB = vec_ld(j + 16, usrc);
427 vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
428 vector unsigned char v_vA = vec_ld(j, vsrc);
429 vector unsigned char v_vB = vec_ld(j + 16, vsrc);
430 vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
431 vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
432 vector unsigned char v_uv_b = vec_mergel(v_u, v_v);
433 vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a);
434 vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a);
435 vector unsigned char v_yuy2_2 = vec_mergeh(v_y2, v_uv_b);
436 vector unsigned char v_yuy2_3 = vec_mergel(v_y2, v_uv_b);
437 vec_st(v_yuy2_0, (i << 1), dst);
438 vec_st(v_yuy2_1, (i << 1) + 16, dst);
439 vec_st(v_yuy2_2, (i << 1) + 32, dst);
440 vec_st(v_yuy2_3, (i << 1) + 48, dst);
441 }
442 if (i < width) {
443 const unsigned int j = i >> 1;
444 vector unsigned char v_y1 = vec_ld(i, ysrc);
445 vector unsigned char v_u = vec_ld(j, usrc);
446 vector unsigned char v_v = vec_ld(j, vsrc);
447 vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
448 vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a);
449 vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a);
450 vec_st(v_yuy2_0, (i << 1), dst);
451 vec_st(v_yuy2_1, (i << 1) + 16, dst);
452 }
453 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
454 usrc += chromStride;
455 vsrc += chromStride;
456 }
457 ysrc += lumStride;
458 dst += dstStride;
459 }
460
461 return srcSliceH;
462}
463
464static inline int yv12touyvy_unscaled_altivec(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
465 int srcSliceH, uint8_t* dstParam[], int dstStride_a[]) {
466 uint8_t *dst=dstParam[0] + dstStride_a[0]*srcSliceY;
467 // yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0], srcStride[1], dstStride[0]);
468 uint8_t *ysrc = src[0];
469 uint8_t *usrc = src[1];
470 uint8_t *vsrc = src[2];
471 const int width = c->srcW;
472 const int height = srcSliceH;
473 const int lumStride = srcStride[0];
474 const int chromStride = srcStride[1];
475 const int dstStride = dstStride_a[0];
476 const int vertLumPerChroma = 2;
477 const vector unsigned char yperm = vec_lvsl(0, ysrc);
478 register unsigned int y;
479
480 if (width&15) {
481 yv12touyvy(ysrc, usrc, vsrc, dst, c->srcW, srcSliceH, lumStride, chromStride, dstStride);
482 return srcSliceH;
483 }
484
485 /* This code assumes:
486
487    1) dst is 16-byte aligned
488    2) dstStride is a multiple of 16
489    3) width is a multiple of 16
490    4) luma & chroma strides are multiples of 8
491 */
492
493 for (y=0; y<height; y++) {
494 int i;
495 for (i = 0; i < width - 31; i+= 32) {
496 const unsigned int j = i >> 1;
497 vector unsigned char v_yA = vec_ld(i, ysrc);
498 vector unsigned char v_yB = vec_ld(i + 16, ysrc);
499 vector unsigned char v_yC = vec_ld(i + 32, ysrc);
500 vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
501 vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm);
502 vector unsigned char v_uA = vec_ld(j, usrc);
503 vector unsigned char v_uB = vec_ld(j + 16, usrc);
504 vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
505 vector unsigned char v_vA = vec_ld(j, vsrc);
506 vector unsigned char v_vB = vec_ld(j + 16, vsrc);
507 vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
508 vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
509 vector unsigned char v_uv_b = vec_mergel(v_u, v_v);
510 vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1);
511 vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1);
512 vector unsigned char v_uyvy_2 = vec_mergeh(v_uv_b, v_y2);
513 vector unsigned char v_uyvy_3 = vec_mergel(v_uv_b, v_y2);
514 vec_st(v_uyvy_0, (i << 1), dst);
515 vec_st(v_uyvy_1, (i << 1) + 16, dst);
516 vec_st(v_uyvy_2, (i << 1) + 32, dst);
517 vec_st(v_uyvy_3, (i << 1) + 48, dst);
518 }
519 if (i < width) {
520 const unsigned int j = i >> 1;
521 vector unsigned char v_y1 = vec_ld(i, ysrc);
522 vector unsigned char v_u = vec_ld(j, usrc);
523 vector unsigned char v_v = vec_ld(j, vsrc);
524 vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
525 vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1);
526 vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1);
527 vec_st(v_uyvy_0, (i << 1), dst);
528 vec_st(v_uyvy_1, (i << 1) + 16, dst);
529 }
530 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
531 usrc += chromStride;
532 vsrc += chromStride;
533 }
534 ysrc += lumStride;
535 dst += dstStride;
536 }
537 return srcSliceH;
538}
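
Alignment handling aside, everything altivec_packIntArrayToCharArray() above does is the 19-bit shift-and-saturate of its scalar tail loop. A plain C reference of just that step (a sketch, mirroring the loop at the end of the function):

#include <stdint.h>

/* Scalar reference for the AltiVec pack above: shift the fixed-point
 * accumulators right by 19 and saturate the result to 0..255. */
static void pack_int_array_to_char_array_c(const int *val, uint8_t *dest, int dstW)
{
    int i;
    for (i = 0; i < dstW; i++) {
        int t = val[i] >> 19;                      /* drop the fractional bits */
        dest[i] = t < 0 ? 0 : (t > 255 ? 255 : t); /* clamp to one byte        */
    }
}

The vector path gets the same effect from vec_sra() by 19 followed by vec_packsu(), which saturates while narrowing the 32-bit lanes down to bytes.
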
diff --git a/src/plugins/ffmpeg/libswscale/swscale_avoption.c b/src/plugins/ffmpeg/libswscale/swscale_avoption.c
deleted file mode 100644
index 1878b4e..0000000
--- a/src/plugins/ffmpeg/libswscale/swscale_avoption.c
+++ /dev/null
@@ -1,59 +0,0 @@
1/*
2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "libavutil/avutil.h"
22#include "libavcodec/opt.h"
23#include "swscale.h"
24#include "swscale_internal.h"
25
26static const char * sws_context_to_name(void * ptr) {
27 return "swscaler";
28}
29
30#define OFFSET(x) offsetof(SwsContext, x)
31#define DEFAULT 0
32#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
33
34static const AVOption options[] = {
35 { "sws_flags", "scaler/cpu flags", OFFSET(flags), FF_OPT_TYPE_FLAGS, DEFAULT, 0, UINT_MAX, VE, "sws_flags" },
36 { "fast_bilinear", "fast bilinear", 0, FF_OPT_TYPE_CONST, SWS_FAST_BILINEAR, INT_MIN, INT_MAX, VE, "sws_flags" },
37 { "bilinear", "bilinear", 0, FF_OPT_TYPE_CONST, SWS_BILINEAR, INT_MIN, INT_MAX, VE, "sws_flags" },
38 { "bicubic", "bicubic", 0, FF_OPT_TYPE_CONST, SWS_BICUBIC, INT_MIN, INT_MAX, VE, "sws_flags" },
39 { "experimental", "experimental", 0, FF_OPT_TYPE_CONST, SWS_X, INT_MIN, INT_MAX, VE, "sws_flags" },
40 { "neighbor", "nearest neighbor", 0, FF_OPT_TYPE_CONST, SWS_POINT, INT_MIN, INT_MAX, VE, "sws_flags" },
41 { "area", "averaging area", 0, FF_OPT_TYPE_CONST, SWS_AREA, INT_MIN, INT_MAX, VE, "sws_flags" },
42 { "bicublin", "luma bicubic, chroma bilinear", 0, FF_OPT_TYPE_CONST, SWS_BICUBLIN, INT_MIN, INT_MAX, VE, "sws_flags" },
43 { "gauss", "gaussian", 0, FF_OPT_TYPE_CONST, SWS_GAUSS, INT_MIN, INT_MAX, VE, "sws_flags" },
44 { "sinc", "sinc", 0, FF_OPT_TYPE_CONST, SWS_SINC, INT_MIN, INT_MAX, VE, "sws_flags" },
45 { "lanczos", "lanczos", 0, FF_OPT_TYPE_CONST, SWS_LANCZOS, INT_MIN, INT_MAX, VE, "sws_flags" },
46 { "spline", "natural bicubic spline", 0, FF_OPT_TYPE_CONST, SWS_SPLINE, INT_MIN, INT_MAX, VE, "sws_flags" },
47 { "print_info", "print info", 0, FF_OPT_TYPE_CONST, SWS_PRINT_INFO, INT_MIN, INT_MAX, VE, "sws_flags" },
48 { "accurate_rnd", "accurate rounding", 0, FF_OPT_TYPE_CONST, SWS_ACCURATE_RND, INT_MIN, INT_MAX, VE, "sws_flags" },
49 { "mmx", "MMX SIMD acceleration", 0, FF_OPT_TYPE_CONST, SWS_CPU_CAPS_MMX, INT_MIN, INT_MAX, VE, "sws_flags" },
50 { "mmx2", "MMX2 SIMD acceleration", 0, FF_OPT_TYPE_CONST, SWS_CPU_CAPS_MMX2, INT_MIN, INT_MAX, VE, "sws_flags" },
51 { "3dnow", "3DNOW SIMD acceleration", 0, FF_OPT_TYPE_CONST, SWS_CPU_CAPS_3DNOW, INT_MIN, INT_MAX, VE, "sws_flags" },
52 { "altivec", "AltiVec SIMD acceleration", 0, FF_OPT_TYPE_CONST, SWS_CPU_CAPS_ALTIVEC, INT_MIN, INT_MAX, VE, "sws_flags" },
53 { "bfin", "Blackfin SIMD acceleration", 0, FF_OPT_TYPE_CONST, SWS_CPU_CAPS_BFIN, INT_MIN, INT_MAX, VE, "sws_flags" },
54 { "full_chroma_int", "full chroma interpolation", 0 , FF_OPT_TYPE_CONST, SWS_FULL_CHR_H_INT, INT_MIN, INT_MAX, VE, "sws_flags" },
55 { "full_chroma_inp", "full chroma input", 0 , FF_OPT_TYPE_CONST, SWS_FULL_CHR_H_INP, INT_MIN, INT_MAX, VE, "sws_flags" },
56 { NULL }
57};
58
59const AVClass sws_context_class = { "SWScaler", sws_context_to_name, options };
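
The option table above only binds human-readable names to SWS_* bits; resolving a name like "bicubic" into SWS_BICUBIC is done by the generic AVOption machinery. A hypothetical stand-alone lookup (not library code) makes the same mapping explicit:

#include <string.h>
#include "swscale.h"

/* Hypothetical helper mirroring part of the table above: map an option
 * name to its SWS_* flag bit, returning 0 for an unknown name. */
static int sws_flag_from_name(const char *name)
{
    static const struct { const char *name; int flag; } map[] = {
        { "fast_bilinear", SWS_FAST_BILINEAR },
        { "bilinear",      SWS_BILINEAR      },
        { "bicubic",       SWS_BICUBIC       },
        { "lanczos",       SWS_LANCZOS       },
        { "spline",        SWS_SPLINE        },
        { "accurate_rnd",  SWS_ACCURATE_RND  },
    };
    size_t i;
    for (i = 0; i < sizeof(map) / sizeof(map[0]); i++)
        if (!strcmp(name, map[i].name))
            return map[i].flag;
    return 0;
}
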
diff --git a/src/plugins/ffmpeg/libswscale/swscale_bfin.c b/src/plugins/ffmpeg/libswscale/swscale_bfin.c
deleted file mode 100644
index 3e63bbd..0000000
--- a/src/plugins/ffmpeg/libswscale/swscale_bfin.c
+++ /dev/null
@@ -1,94 +0,0 @@
1/*
2 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
3 *
4 * Blackfin software video scaler operations
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23#include <stdio.h>
24#include <stdlib.h>
25#include <string.h>
26#include <inttypes.h>
27#include <assert.h>
28#include "config.h"
29#ifdef HAVE_MALLOC_H
30#include <malloc.h>
31#endif
32#include <unistd.h>
33#include "rgb2rgb.h"
34#include "swscale.h"
35#include "swscale_internal.h"
36
37#ifdef __FDPIC__
38#define L1CODE __attribute__ ((l1_text))
39#else
40#define L1CODE
41#endif
42
43extern int ff_bfin_uyvytoyv12 (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
44 long width, long height,
45 long lumStride, long chromStride, long srcStride) L1CODE;
46
47extern int ff_bfin_yuyvtoyv12 (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
48 long width, long height,
49 long lumStride, long chromStride, long srcStride) L1CODE;
50
51static int uyvytoyv12_unscaled (SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
52 int srcSliceH, uint8_t* dst[], int dstStride[])
53{
54 uint8_t *dsty = dst[0] + dstStride[0]*srcSliceY;
55 uint8_t *dstu = dst[1] + dstStride[1]*srcSliceY/2;
56 uint8_t *dstv = dst[2] + dstStride[2]*srcSliceY/2;
57 uint8_t *ip = src[0] + srcStride[0]*srcSliceY;
58 int w = dstStride[0];
59
60 ff_bfin_uyvytoyv12 (ip, dsty, dstu, dstv, w, srcSliceH, dstStride[0], dstStride[1], srcStride[0]);
61
62 return srcSliceH;
63}
64
65static int yuyvtoyv12_unscaled (SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
66 int srcSliceH, uint8_t* dst[], int dstStride[])
67{
68 uint8_t *dsty = dst[0] + dstStride[0]*srcSliceY;
69 uint8_t *dstu = dst[1] + dstStride[1]*srcSliceY/2;
70 uint8_t *dstv = dst[2] + dstStride[2]*srcSliceY/2;
71 uint8_t *ip = src[0] + srcStride[0]*srcSliceY;
72 int w = dstStride[0];
73
74 ff_bfin_yuyvtoyv12 (ip, dsty, dstu, dstv, w, srcSliceH, dstStride[0], dstStride[1], srcStride[0]);
75
76 return srcSliceH;
77}
78
79
80void ff_bfin_get_unscaled_swscale (SwsContext *c)
81{
82 SwsFunc swScale = c->swScale;
83 if (c->flags & SWS_CPU_CAPS_BFIN)
84 if (c->dstFormat == PIX_FMT_YUV420P)
85 if (c->srcFormat == PIX_FMT_UYVY422) {
86 av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized uyvytoyv12_unscaled\n");
87 c->swScale = uyvytoyv12_unscaled;
88 }
89 if (c->dstFormat == PIX_FMT_YUV420P)
90 if (c->srcFormat == PIX_FMT_YUYV422) {
91 av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized yuyvtoyv12_unscaled\n");
92 c->swScale = yuyvtoyv12_unscaled;
93 }
94}
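
Both Blackfin wrappers above share the same 4:2:0 slice addressing: luma advances one row per slice line, chroma half as fast. Isolated as a sketch (hypothetical helper name, same arithmetic as the code, with srcSliceY assumed even as 4:2:0 slicing requires):

#include <stdint.h>

/* 4:2:0 slice addressing as used by the wrappers above: one chroma row
 * exists per two luma rows, so the chroma offsets use srcSliceY/2. */
static void yv12_slice_pointers(uint8_t *dst[3], int dstStride[3], int srcSliceY,
                                uint8_t **dsty, uint8_t **dstu, uint8_t **dstv)
{
    *dsty = dst[0] + dstStride[0] * srcSliceY;
    *dstu = dst[1] + dstStride[1] * srcSliceY / 2;
    *dstv = dst[2] + dstStride[2] * srcSliceY / 2;
}
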
diff --git a/src/plugins/ffmpeg/libswscale/swscale_internal.h b/src/plugins/ffmpeg/libswscale/swscale_internal.h
deleted file mode 100644
index 14c3a04..0000000
--- a/src/plugins/ffmpeg/libswscale/swscale_internal.h
+++ /dev/null
@@ -1,283 +0,0 @@
1/*
2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#ifndef FFMPEG_SWSCALE_INTERNAL_H
22#define FFMPEG_SWSCALE_INTERNAL_H
23
24#include "config.h"
25
26#ifdef HAVE_ALTIVEC_H
27#include <altivec.h>
28#endif
29
30#include "libavutil/avutil.h"
31
32#define MAX_FILTER_SIZE 256
33
34#define VOFW 8192
35#define VOF (VOFW*2)
36
37typedef int (*SwsFunc)(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
38 int srcSliceH, uint8_t* dst[], int dstStride[]);
39
40/* This struct should be aligned on at least a 32-byte boundary. */
41typedef struct SwsContext{
42 /**
43 * info on struct for av_log
44 */
45 const AVClass *av_class;
46
47 /**
48 * Note that src, dst, srcStride, dstStride will be copied in the
49 * sws_scale() wrapper so they can be freely modified here.
50 */
51 SwsFunc swScale;
52 int srcW, srcH, dstH;
53 int chrSrcW, chrSrcH, chrDstW, chrDstH;
54 int lumXInc, chrXInc;
55 int lumYInc, chrYInc;
56 int dstFormat, srcFormat; ///< format 4:2:0 type is always YV12
57 int origDstFormat, origSrcFormat; ///< format
58 int chrSrcHSubSample, chrSrcVSubSample;
59 int chrIntHSubSample, chrIntVSubSample;
60 int chrDstHSubSample, chrDstVSubSample;
61 int vChrDrop;
62 int sliceDir;
63 double param[2];
64
65 int16_t **lumPixBuf;
66 int16_t **chrPixBuf;
67 int16_t *hLumFilter;
68 int16_t *hLumFilterPos;
69 int16_t *hChrFilter;
70 int16_t *hChrFilterPos;
71 int16_t *vLumFilter;
72 int16_t *vLumFilterPos;
73 int16_t *vChrFilter;
74 int16_t *vChrFilterPos;
75
76 uint8_t formatConvBuffer[VOF]; //FIXME dynamic allocation, but we have to change a lot of code for this to be useful
77
78 int hLumFilterSize;
79 int hChrFilterSize;
80 int vLumFilterSize;
81 int vChrFilterSize;
82 int vLumBufSize;
83 int vChrBufSize;
84
85 uint8_t *funnyYCode;
86 uint8_t *funnyUVCode;
87 int32_t *lumMmx2FilterPos;
88 int32_t *chrMmx2FilterPos;
89 int16_t *lumMmx2Filter;
90 int16_t *chrMmx2Filter;
91
92 int canMMX2BeUsed;
93
94 int lastInLumBuf;
95 int lastInChrBuf;
96 int lumBufIndex;
97 int chrBufIndex;
98 int dstY;
99 int flags;
100    void * yuvTable; // pointer to the yuv->rgb table start so it can be free()d
101 uint8_t * table_rV[256];
102 uint8_t * table_gU[256];
103 int table_gV[256];
104 uint8_t * table_bU[256];
105
106 //Colorspace stuff
107 int contrast, brightness, saturation; // for sws_getColorspaceDetails
108 int srcColorspaceTable[4];
109 int dstColorspaceTable[4];
110 int srcRange, dstRange;
111
112#define RED_DITHER "0*8"
113#define GREEN_DITHER "1*8"
114#define BLUE_DITHER "2*8"
115#define Y_COEFF "3*8"
116#define VR_COEFF "4*8"
117#define UB_COEFF "5*8"
118#define VG_COEFF "6*8"
119#define UG_COEFF "7*8"
120#define Y_OFFSET "8*8"
121#define U_OFFSET "9*8"
122#define V_OFFSET "10*8"
123#define LUM_MMX_FILTER_OFFSET "11*8"
124#define CHR_MMX_FILTER_OFFSET "11*8+4*4*256"
125#define DSTW_OFFSET "11*8+4*4*256*2" //do not change, it is hardcoded in the ASM
126#define ESP_OFFSET "11*8+4*4*256*2+8"
127#define VROUNDER_OFFSET "11*8+4*4*256*2+16"
128#define U_TEMP "11*8+4*4*256*2+24"
129#define V_TEMP "11*8+4*4*256*2+32"
130
131 uint64_t redDither __attribute__((aligned(8)));
132 uint64_t greenDither __attribute__((aligned(8)));
133 uint64_t blueDither __attribute__((aligned(8)));
134
135 uint64_t yCoeff __attribute__((aligned(8)));
136 uint64_t vrCoeff __attribute__((aligned(8)));
137 uint64_t ubCoeff __attribute__((aligned(8)));
138 uint64_t vgCoeff __attribute__((aligned(8)));
139 uint64_t ugCoeff __attribute__((aligned(8)));
140 uint64_t yOffset __attribute__((aligned(8)));
141 uint64_t uOffset __attribute__((aligned(8)));
142 uint64_t vOffset __attribute__((aligned(8)));
143 int32_t lumMmxFilter[4*MAX_FILTER_SIZE];
144 int32_t chrMmxFilter[4*MAX_FILTER_SIZE];
145 int dstW;
146 uint64_t esp __attribute__((aligned(8)));
147 uint64_t vRounder __attribute__((aligned(8)));
148 uint64_t u_temp __attribute__((aligned(8)));
149 uint64_t v_temp __attribute__((aligned(8)));
150
151#ifdef HAVE_ALTIVEC
152
153 vector signed short CY;
154 vector signed short CRV;
155 vector signed short CBU;
156 vector signed short CGU;
157 vector signed short CGV;
158 vector signed short OY;
159 vector unsigned short CSHIFT;
160 vector signed short *vYCoeffsBank, *vCCoeffsBank;
161
162#endif
163
164
165#ifdef ARCH_BFIN
166 uint32_t oy __attribute__((aligned(4)));
167 uint32_t oc __attribute__((aligned(4)));
168 uint32_t zero __attribute__((aligned(4)));
169 uint32_t cy __attribute__((aligned(4)));
170 uint32_t crv __attribute__((aligned(4)));
171 uint32_t rmask __attribute__((aligned(4)));
172 uint32_t cbu __attribute__((aligned(4)));
173 uint32_t bmask __attribute__((aligned(4)));
174 uint32_t cgu __attribute__((aligned(4)));
175 uint32_t cgv __attribute__((aligned(4)));
176 uint32_t gmask __attribute__((aligned(4)));
177#endif
178
179#ifdef HAVE_VIS
180 uint64_t sparc_coeffs[10] __attribute__((aligned(8)));
181#endif
182
183} SwsContext;
184//FIXME check init (where 0)
185
186SwsFunc yuv2rgb_get_func_ptr (SwsContext *c);
187int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation);
188
189void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation);
190SwsFunc yuv2rgb_init_altivec (SwsContext *c);
191void altivec_yuv2packedX (SwsContext *c,
192 int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
193 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
194 uint8_t *dest, int dstW, int dstY);
195
196const char *sws_format_name(int format);
197
198//FIXME replace this with something faster
199#define isPlanarYUV(x) ( \
200 (x)==PIX_FMT_YUV410P \
201 || (x)==PIX_FMT_YUV420P \
202 || (x)==PIX_FMT_YUV411P \
203 || (x)==PIX_FMT_YUV422P \
204 || (x)==PIX_FMT_YUV444P \
205 || (x)==PIX_FMT_YUV440P \
206 || (x)==PIX_FMT_NV12 \
207 || (x)==PIX_FMT_NV21 \
208 )
209#define isYUV(x) ( \
210 (x)==PIX_FMT_UYVY422 \
211 || (x)==PIX_FMT_YUYV422 \
212 || isPlanarYUV(x) \
213 )
214#define isGray(x) ( \
215 (x)==PIX_FMT_GRAY8 \
216 || (x)==PIX_FMT_GRAY16BE \
217 || (x)==PIX_FMT_GRAY16LE \
218 )
219#define isGray16(x) ( \
220 (x)==PIX_FMT_GRAY16BE \
221 || (x)==PIX_FMT_GRAY16LE \
222 )
223#define isRGB(x) ( \
224 (x)==PIX_FMT_BGR32 \
225 || (x)==PIX_FMT_RGB24 \
226 || (x)==PIX_FMT_RGB565 \
227 || (x)==PIX_FMT_RGB555 \
228 || (x)==PIX_FMT_RGB8 \
229 || (x)==PIX_FMT_RGB4 \
230 || (x)==PIX_FMT_RGB4_BYTE \
231 || (x)==PIX_FMT_MONOBLACK \
232 )
233#define isBGR(x) ( \
234 (x)==PIX_FMT_RGB32 \
235 || (x)==PIX_FMT_BGR24 \
236 || (x)==PIX_FMT_BGR565 \
237 || (x)==PIX_FMT_BGR555 \
238 || (x)==PIX_FMT_BGR8 \
239 || (x)==PIX_FMT_BGR4 \
240 || (x)==PIX_FMT_BGR4_BYTE \
241 || (x)==PIX_FMT_MONOBLACK \
242 )
243
244static inline int fmt_depth(int fmt)
245{
246 switch(fmt) {
247 case PIX_FMT_BGRA:
248 case PIX_FMT_ABGR:
249 case PIX_FMT_RGBA:
250 case PIX_FMT_ARGB:
251 return 32;
252 case PIX_FMT_BGR24:
253 case PIX_FMT_RGB24:
254 return 24;
255 case PIX_FMT_BGR565:
256 case PIX_FMT_RGB565:
257 case PIX_FMT_GRAY16BE:
258 case PIX_FMT_GRAY16LE:
259 return 16;
260 case PIX_FMT_BGR555:
261 case PIX_FMT_RGB555:
262 return 15;
263 case PIX_FMT_BGR8:
264 case PIX_FMT_RGB8:
265 return 8;
266 case PIX_FMT_BGR4:
267 case PIX_FMT_RGB4:
268 case PIX_FMT_BGR4_BYTE:
269 case PIX_FMT_RGB4_BYTE:
270 return 4;
271 case PIX_FMT_MONOBLACK:
272 return 1;
273 default:
274 return 0;
275 }
276}
277
278extern const DECLARE_ALIGNED(8, uint64_t, ff_dither4[2]);
279extern const DECLARE_ALIGNED(8, uint64_t, ff_dither8[2]);
280
281extern const AVClass sws_context_class;
282
283#endif /* FFMPEG_SWSCALE_INTERNAL_H */
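
fmt_depth() above reports bits per pixel as the scaler accounts for them (15 for RGB555, 1 for MONOBLACK, and so on). A trivial sketch of the kind of comparison it enables (hypothetical helper, not library code):

/* Hypothetical helper built on fmt_depth() above: true when the
 * destination carries fewer bits per pixel than the source, i.e. the
 * conversion discards precision and dithering may be worthwhile. */
static inline int conversion_loses_depth(int srcFormat, int dstFormat)
{
    return fmt_depth(dstFormat) < fmt_depth(srcFormat);
}
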
diff --git a/src/plugins/ffmpeg/libswscale/swscale_template.c b/src/plugins/ffmpeg/libswscale/swscale_template.c
deleted file mode 100644
index 1280ba6..0000000
--- a/src/plugins/ffmpeg/libswscale/swscale_template.c
+++ /dev/null
@@ -1,3295 +0,0 @@
1/*
2 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * The C code (not assembly, MMX, ...) of this file can be used
21 * under the LGPL license.
22 */
23
24#undef REAL_MOVNTQ
25#undef MOVNTQ
26#undef PAVGB
27#undef PREFETCH
28#undef PREFETCHW
29#undef EMMS
30#undef SFENCE
31
32#ifdef HAVE_3DNOW
33/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
34#define EMMS "femms"
35#else
36#define EMMS "emms"
37#endif
38
39#ifdef HAVE_3DNOW
40#define PREFETCH "prefetch"
41#define PREFETCHW "prefetchw"
42#elif defined (HAVE_MMX2)
43#define PREFETCH "prefetchnta"
44#define PREFETCHW "prefetcht0"
45#else
46#define PREFETCH " # nop"
47#define PREFETCHW " # nop"
48#endif
49
50#ifdef HAVE_MMX2
51#define SFENCE "sfence"
52#else
53#define SFENCE " # nop"
54#endif
55
56#ifdef HAVE_MMX2
57#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
58#elif defined (HAVE_3DNOW)
59#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
60#endif
61
62#ifdef HAVE_MMX2
63#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
64#else
65#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
66#endif
67#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
68
69#ifdef HAVE_ALTIVEC
70#include "swscale_altivec_template.c"
71#endif
72
73#define YSCALEYUV2YV12X(x, offset, dest, width) \
74 asm volatile(\
75 "xor %%"REG_a", %%"REG_a" \n\t"\
76 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
77 "movq %%mm3, %%mm4 \n\t"\
78 "lea " offset "(%0), %%"REG_d" \n\t"\
79 "mov (%%"REG_d"), %%"REG_S" \n\t"\
80 ASMALIGN(4) /* FIXME Unroll? */\
81 "1: \n\t"\
82 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
83 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
84 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\
85 "add $16, %%"REG_d" \n\t"\
86 "mov (%%"REG_d"), %%"REG_S" \n\t"\
87 "test %%"REG_S", %%"REG_S" \n\t"\
88 "pmulhw %%mm0, %%mm2 \n\t"\
89 "pmulhw %%mm0, %%mm5 \n\t"\
90 "paddw %%mm2, %%mm3 \n\t"\
91 "paddw %%mm5, %%mm4 \n\t"\
92 " jnz 1b \n\t"\
93 "psraw $3, %%mm3 \n\t"\
94 "psraw $3, %%mm4 \n\t"\
95 "packuswb %%mm4, %%mm3 \n\t"\
96 MOVNTQ(%%mm3, (%1, %%REGa))\
97 "add $8, %%"REG_a" \n\t"\
98 "cmp %2, %%"REG_a" \n\t"\
99 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
100 "movq %%mm3, %%mm4 \n\t"\
101 "lea " offset "(%0), %%"REG_d" \n\t"\
102 "mov (%%"REG_d"), %%"REG_S" \n\t"\
103 "jb 1b \n\t"\
104 :: "r" (&c->redDither),\
105 "r" (dest), "g" (width)\
106 : "%"REG_a, "%"REG_d, "%"REG_S\
107 );
108
109#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
110 asm volatile(\
111 "lea " offset "(%0), %%"REG_d" \n\t"\
112 "xor %%"REG_a", %%"REG_a" \n\t"\
113 "pxor %%mm4, %%mm4 \n\t"\
114 "pxor %%mm5, %%mm5 \n\t"\
115 "pxor %%mm6, %%mm6 \n\t"\
116 "pxor %%mm7, %%mm7 \n\t"\
117 "mov (%%"REG_d"), %%"REG_S" \n\t"\
118 ASMALIGN(4) \
119 "1: \n\t"\
120 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\
121 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\
122 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
123 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\
124 "movq %%mm0, %%mm3 \n\t"\
125 "punpcklwd %%mm1, %%mm0 \n\t"\
126 "punpckhwd %%mm1, %%mm3 \n\t"\
127 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
128 "pmaddwd %%mm1, %%mm0 \n\t"\
129 "pmaddwd %%mm1, %%mm3 \n\t"\
130 "paddd %%mm0, %%mm4 \n\t"\
131 "paddd %%mm3, %%mm5 \n\t"\
132 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\
133 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
134 "add $16, %%"REG_d" \n\t"\
135 "test %%"REG_S", %%"REG_S" \n\t"\
136 "movq %%mm2, %%mm0 \n\t"\
137 "punpcklwd %%mm3, %%mm2 \n\t"\
138 "punpckhwd %%mm3, %%mm0 \n\t"\
139 "pmaddwd %%mm1, %%mm2 \n\t"\
140 "pmaddwd %%mm1, %%mm0 \n\t"\
141 "paddd %%mm2, %%mm6 \n\t"\
142 "paddd %%mm0, %%mm7 \n\t"\
143 " jnz 1b \n\t"\
144 "psrad $16, %%mm4 \n\t"\
145 "psrad $16, %%mm5 \n\t"\
146 "psrad $16, %%mm6 \n\t"\
147 "psrad $16, %%mm7 \n\t"\
148 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
149 "packssdw %%mm5, %%mm4 \n\t"\
150 "packssdw %%mm7, %%mm6 \n\t"\
151 "paddw %%mm0, %%mm4 \n\t"\
152 "paddw %%mm0, %%mm6 \n\t"\
153 "psraw $3, %%mm4 \n\t"\
154 "psraw $3, %%mm6 \n\t"\
155 "packuswb %%mm6, %%mm4 \n\t"\
156 MOVNTQ(%%mm4, (%1, %%REGa))\
157 "add $8, %%"REG_a" \n\t"\
158 "cmp %2, %%"REG_a" \n\t"\
159 "lea " offset "(%0), %%"REG_d" \n\t"\
160 "pxor %%mm4, %%mm4 \n\t"\
161 "pxor %%mm5, %%mm5 \n\t"\
162 "pxor %%mm6, %%mm6 \n\t"\
163 "pxor %%mm7, %%mm7 \n\t"\
164 "mov (%%"REG_d"), %%"REG_S" \n\t"\
165 "jb 1b \n\t"\
166 :: "r" (&c->redDither),\
167 "r" (dest), "g" (width)\
168 : "%"REG_a, "%"REG_d, "%"REG_S\
169 );
170
171#define YSCALEYUV2YV121 \
172 "mov %2, %%"REG_a" \n\t"\
173 ASMALIGN(4) /* FIXME Unroll? */\
174 "1: \n\t"\
175 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
176 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
177 "psraw $7, %%mm0 \n\t"\
178 "psraw $7, %%mm1 \n\t"\
179 "packuswb %%mm1, %%mm0 \n\t"\
180 MOVNTQ(%%mm0, (%1, %%REGa))\
181 "add $8, %%"REG_a" \n\t"\
182 "jnc 1b \n\t"
183
184#define YSCALEYUV2YV121_ACCURATE \
185 "mov %2, %%"REG_a" \n\t"\
186 "pcmpeqw %%mm7, %%mm7 \n\t"\
187 "psrlw $15, %%mm7 \n\t"\
188 "psllw $6, %%mm7 \n\t"\
189 ASMALIGN(4) /* FIXME Unroll? */\
190 "1: \n\t"\
191 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
192 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
193 "paddw %%mm7, %%mm0 \n\t"\
194 "paddw %%mm7, %%mm1 \n\t"\
195 "psraw $7, %%mm0 \n\t"\
196 "psraw $7, %%mm1 \n\t"\
197 "packuswb %%mm1, %%mm0 \n\t"\
198 MOVNTQ(%%mm0, (%1, %%REGa))\
199 "add $8, %%"REG_a" \n\t"\
200 "jnc 1b \n\t"
201
202/*
203 :: "m" (-lumFilterSize), "m" (-chrFilterSize),
204 "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
205 "r" (dest), "m" (dstW),
206 "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
207 : "%eax", "%ebx", "%ecx", "%edx", "%esi"
208*/
209#define YSCALEYUV2PACKEDX \
210 asm volatile(\
211 "xor %%"REG_a", %%"REG_a" \n\t"\
212 ASMALIGN(4)\
213 "nop \n\t"\
214 "1: \n\t"\
215 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
216 "mov (%%"REG_d"), %%"REG_S" \n\t"\
217 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
218 "movq %%mm3, %%mm4 \n\t"\
219 ASMALIGN(4)\
220 "2: \n\t"\
221 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
222 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
223 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
224 "add $16, %%"REG_d" \n\t"\
225 "mov (%%"REG_d"), %%"REG_S" \n\t"\
226 "pmulhw %%mm0, %%mm2 \n\t"\
227 "pmulhw %%mm0, %%mm5 \n\t"\
228 "paddw %%mm2, %%mm3 \n\t"\
229 "paddw %%mm5, %%mm4 \n\t"\
230 "test %%"REG_S", %%"REG_S" \n\t"\
231 " jnz 2b \n\t"\
232\
233 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
234 "mov (%%"REG_d"), %%"REG_S" \n\t"\
235 "movq "VROUNDER_OFFSET"(%0), %%mm1 \n\t"\
236 "movq %%mm1, %%mm7 \n\t"\
237 ASMALIGN(4)\
238 "2: \n\t"\
239 "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
240 "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
241 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\
242 "add $16, %%"REG_d" \n\t"\
243 "mov (%%"REG_d"), %%"REG_S" \n\t"\
244 "pmulhw %%mm0, %%mm2 \n\t"\
245 "pmulhw %%mm0, %%mm5 \n\t"\
246 "paddw %%mm2, %%mm1 \n\t"\
247 "paddw %%mm5, %%mm7 \n\t"\
248 "test %%"REG_S", %%"REG_S" \n\t"\
249 " jnz 2b \n\t"\
250
251#define YSCALEYUV2PACKEDX_END \
252 :: "r" (&c->redDither), \
253 "m" (dummy), "m" (dummy), "m" (dummy),\
254 "r" (dest), "m" (dstW) \
255 : "%"REG_a, "%"REG_d, "%"REG_S \
256 );
257
258#define YSCALEYUV2PACKEDX_ACCURATE \
259 asm volatile(\
260 "xor %%"REG_a", %%"REG_a" \n\t"\
261 ASMALIGN(4)\
262 "nop \n\t"\
263 "1: \n\t"\
264 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
265 "mov (%%"REG_d"), %%"REG_S" \n\t"\
266 "pxor %%mm4, %%mm4 \n\t"\
267 "pxor %%mm5, %%mm5 \n\t"\
268 "pxor %%mm6, %%mm6 \n\t"\
269 "pxor %%mm7, %%mm7 \n\t"\
270 ASMALIGN(4)\
271 "2: \n\t"\
272 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
273 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
274 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
275 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
276 "movq %%mm0, %%mm3 \n\t"\
277 "punpcklwd %%mm1, %%mm0 \n\t"\
278 "punpckhwd %%mm1, %%mm3 \n\t"\
279 "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\
280 "pmaddwd %%mm1, %%mm0 \n\t"\
281 "pmaddwd %%mm1, %%mm3 \n\t"\
282 "paddd %%mm0, %%mm4 \n\t"\
283 "paddd %%mm3, %%mm5 \n\t"\
284 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
285 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
286 "add $16, %%"REG_d" \n\t"\
287 "test %%"REG_S", %%"REG_S" \n\t"\
288 "movq %%mm2, %%mm0 \n\t"\
289 "punpcklwd %%mm3, %%mm2 \n\t"\
290 "punpckhwd %%mm3, %%mm0 \n\t"\
291 "pmaddwd %%mm1, %%mm2 \n\t"\
292 "pmaddwd %%mm1, %%mm0 \n\t"\
293 "paddd %%mm2, %%mm6 \n\t"\
294 "paddd %%mm0, %%mm7 \n\t"\
295 " jnz 2b \n\t"\
296 "psrad $16, %%mm4 \n\t"\
297 "psrad $16, %%mm5 \n\t"\
298 "psrad $16, %%mm6 \n\t"\
299 "psrad $16, %%mm7 \n\t"\
300 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
301 "packssdw %%mm5, %%mm4 \n\t"\
302 "packssdw %%mm7, %%mm6 \n\t"\
303 "paddw %%mm0, %%mm4 \n\t"\
304 "paddw %%mm0, %%mm6 \n\t"\
305 "movq %%mm4, "U_TEMP"(%0) \n\t"\
306 "movq %%mm6, "V_TEMP"(%0) \n\t"\
307\
308 "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
309 "mov (%%"REG_d"), %%"REG_S" \n\t"\
310 "pxor %%mm1, %%mm1 \n\t"\
311 "pxor %%mm5, %%mm5 \n\t"\
312 "pxor %%mm7, %%mm7 \n\t"\
313 "pxor %%mm6, %%mm6 \n\t"\
314 ASMALIGN(4)\
315 "2: \n\t"\
316 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
317 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
318 "mov 4(%%"REG_d"), %%"REG_S" \n\t"\
319 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
320 "movq %%mm0, %%mm3 \n\t"\
321 "punpcklwd %%mm4, %%mm0 \n\t"\
322 "punpckhwd %%mm4, %%mm3 \n\t"\
323 "movq 8(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
324 "pmaddwd %%mm4, %%mm0 \n\t"\
325 "pmaddwd %%mm4, %%mm3 \n\t"\
326 "paddd %%mm0, %%mm1 \n\t"\
327 "paddd %%mm3, %%mm5 \n\t"\
328 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
329 "mov 16(%%"REG_d"), %%"REG_S" \n\t"\
330 "add $16, %%"REG_d" \n\t"\
331 "test %%"REG_S", %%"REG_S" \n\t"\
332 "movq %%mm2, %%mm0 \n\t"\
333 "punpcklwd %%mm3, %%mm2 \n\t"\
334 "punpckhwd %%mm3, %%mm0 \n\t"\
335 "pmaddwd %%mm4, %%mm2 \n\t"\
336 "pmaddwd %%mm4, %%mm0 \n\t"\
337 "paddd %%mm2, %%mm7 \n\t"\
338 "paddd %%mm0, %%mm6 \n\t"\
339 " jnz 2b \n\t"\
340 "psrad $16, %%mm1 \n\t"\
341 "psrad $16, %%mm5 \n\t"\
342 "psrad $16, %%mm7 \n\t"\
343 "psrad $16, %%mm6 \n\t"\
344 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
345 "packssdw %%mm5, %%mm1 \n\t"\
346 "packssdw %%mm6, %%mm7 \n\t"\
347 "paddw %%mm0, %%mm1 \n\t"\
348 "paddw %%mm0, %%mm7 \n\t"\
349 "movq "U_TEMP"(%0), %%mm3 \n\t"\
350 "movq "V_TEMP"(%0), %%mm4 \n\t"\
351
352#define YSCALEYUV2RGBX \
353 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
354 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
355 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
356 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
357 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
358 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
359/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
360 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
361 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
362 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
363 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
364 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
365 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
366/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
367 "paddw %%mm3, %%mm4 \n\t"\
368 "movq %%mm2, %%mm0 \n\t"\
369 "movq %%mm5, %%mm6 \n\t"\
370 "movq %%mm4, %%mm3 \n\t"\
371 "punpcklwd %%mm2, %%mm2 \n\t"\
372 "punpcklwd %%mm5, %%mm5 \n\t"\
373 "punpcklwd %%mm4, %%mm4 \n\t"\
374 "paddw %%mm1, %%mm2 \n\t"\
375 "paddw %%mm1, %%mm5 \n\t"\
376 "paddw %%mm1, %%mm4 \n\t"\
377 "punpckhwd %%mm0, %%mm0 \n\t"\
378 "punpckhwd %%mm6, %%mm6 \n\t"\
379 "punpckhwd %%mm3, %%mm3 \n\t"\
380 "paddw %%mm7, %%mm0 \n\t"\
381 "paddw %%mm7, %%mm6 \n\t"\
382 "paddw %%mm7, %%mm3 \n\t"\
383 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
384 "packuswb %%mm0, %%mm2 \n\t"\
385 "packuswb %%mm6, %%mm5 \n\t"\
386 "packuswb %%mm3, %%mm4 \n\t"\
387 "pxor %%mm7, %%mm7 \n\t"
388#if 0
389#define FULL_YSCALEYUV2RGB \
390 "pxor %%mm7, %%mm7 \n\t"\
391 "movd %6, %%mm6 \n\t" /*yalpha1*/\
392 "punpcklwd %%mm6, %%mm6 \n\t"\
393 "punpcklwd %%mm6, %%mm6 \n\t"\
394 "movd %7, %%mm5 \n\t" /*uvalpha1*/\
395 "punpcklwd %%mm5, %%mm5 \n\t"\
396 "punpcklwd %%mm5, %%mm5 \n\t"\
397 "xor %%"REG_a", %%"REG_a" \n\t"\
398 ASMALIGN(4)\
399 "1: \n\t"\
400 "movq (%0, %%"REG_a",2), %%mm0 \n\t" /*buf0[eax]*/\
401 "movq (%1, %%"REG_a",2), %%mm1 \n\t" /*buf1[eax]*/\
402 "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\
403 "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\
404 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
405 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
406 "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
407 "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
408 "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
409 "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
410 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
411 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
412 "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
413 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
414 "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
415 "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
416 "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
417 "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
418\
419\
420 "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
421 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
422 "pmulhw "MANGLE(ubCoeff)", %%mm3 \n\t"\
423 "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
424 "pmulhw "MANGLE(ugCoeff)", %%mm2 \n\t"\
425 "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
426 "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
427\
428\
429 "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
430 "pmulhw "MANGLE(vrCoeff)", %%mm0 \n\t"\
431 "pmulhw "MANGLE(vgCoeff)", %%mm4 \n\t"\
432 "paddw %%mm1, %%mm3 \n\t" /* B*/\
433 "paddw %%mm1, %%mm0 \n\t" /* R*/\
434 "packuswb %%mm3, %%mm3 \n\t"\
435\
436 "packuswb %%mm0, %%mm0 \n\t"\
437 "paddw %%mm4, %%mm2 \n\t"\
438 "paddw %%mm2, %%mm1 \n\t" /* G*/\
439\
440 "packuswb %%mm1, %%mm1 \n\t"
441#endif
442
443#define REAL_YSCALEYUV2PACKED(index, c) \
444 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
445 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
446 "psraw $3, %%mm0 \n\t"\
447 "psraw $3, %%mm1 \n\t"\
448 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
449 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
450 "xor "#index", "#index" \n\t"\
451 ASMALIGN(4)\
452 "1: \n\t"\
453 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
454 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
455 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
456 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
457 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
458 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
459 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
460 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
461 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
462    "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\
463    "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\
464 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
465 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
466 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
467 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
468 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
469 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
470 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
471 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
472 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
473 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
474    "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\
475    "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>7*/\
476 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
477 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
478
479#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
480
481#define REAL_YSCALEYUV2RGB(index, c) \
482 "xor "#index", "#index" \n\t"\
483 ASMALIGN(4)\
484 "1: \n\t"\
485 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
486 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
487 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
488 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
489 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
490 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
491 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
492 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
493 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
494 "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
495 "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
496 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
497 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
498 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
499 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
500 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
501 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
502 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
503 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
504 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
505 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
506 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
507 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
508 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
509 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
510 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
511 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
512 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
513 "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
514 "psraw $4, %%mm7 \n\t" /* buf1[eax+4] >>4*/\
515 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
516 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
517 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
518 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
519 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
520 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
521 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
522 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
523 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
524 "paddw %%mm3, %%mm4 \n\t"\
525 "movq %%mm2, %%mm0 \n\t"\
526 "movq %%mm5, %%mm6 \n\t"\
527 "movq %%mm4, %%mm3 \n\t"\
528 "punpcklwd %%mm2, %%mm2 \n\t"\
529 "punpcklwd %%mm5, %%mm5 \n\t"\
530 "punpcklwd %%mm4, %%mm4 \n\t"\
531 "paddw %%mm1, %%mm2 \n\t"\
532 "paddw %%mm1, %%mm5 \n\t"\
533 "paddw %%mm1, %%mm4 \n\t"\
534 "punpckhwd %%mm0, %%mm0 \n\t"\
535 "punpckhwd %%mm6, %%mm6 \n\t"\
536 "punpckhwd %%mm3, %%mm3 \n\t"\
537 "paddw %%mm7, %%mm0 \n\t"\
538 "paddw %%mm7, %%mm6 \n\t"\
539 "paddw %%mm7, %%mm3 \n\t"\
540 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
541 "packuswb %%mm0, %%mm2 \n\t"\
542 "packuswb %%mm6, %%mm5 \n\t"\
543 "packuswb %%mm3, %%mm4 \n\t"\
544 "pxor %%mm7, %%mm7 \n\t"
545#define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c)
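
/* For reference, a scalar sketch of the arithmetic YSCALEYUV2RGB performs
 * per sample (illustrative only, not part of the original file; the
 * coefficient and offset parameters stand in for the per-context fixed-point
 * constants at Y_COEFF/Y_OFFSET/U_OFFSET/V_OFFSET and the U*_/V*_COEFF
 * slots, all applied pmulhw-style: (a*b)>>16). */

/* vertical linear interpolation, as done by the psubw/pmulhw/psraw/paddw
 * run above; alpha16 is a 16-bit blend factor */
static int vinterp_sketch(int v0, int v1, int alpha16)
{
    return (v1 >> 4) + (((v0 - v1) * alpha16) >> 16);
}

static uint8_t clamp255_sketch(int x)
{
    return x < 0 ? 0 : x > 255 ? 255 : x; /* what packuswb does */
}

/* Y, U, V are the interpolated samples from vinterp_sketch() */
static void yuv2rgb_pixel_sketch(int Y, int U, int V,
                                 int y_coeff, int y_off, int u_off, int v_off,
                                 int ub, int ug, int vg, int vr,
                                 uint8_t rgb[3])
{
    U -= u_off;                          /* psubw U_OFFSET */
    V -= v_off;                          /* psubw V_OFFSET */
    Y  = ((Y - y_off) * y_coeff) >> 16;  /* psubw Y_OFFSET, pmulhw Y_COEFF */
    rgb[2] = clamp255_sketch(Y + ((U * ub) >> 16));                    /* B */
    rgb[1] = clamp255_sketch(Y + ((U * ug) >> 16) + ((V * vg) >> 16)); /* G */
    rgb[0] = clamp255_sketch(Y + ((V * vr) >> 16));                    /* R */
}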
546
547#define REAL_YSCALEYUV2PACKED1(index, c) \
548 "xor "#index", "#index" \n\t"\
549 ASMALIGN(4)\
550 "1: \n\t"\
551 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
552 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
553 "psraw $7, %%mm3 \n\t" \
554 "psraw $7, %%mm4 \n\t" \
555 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
556 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
557 "psraw $7, %%mm1 \n\t" \
558 "psraw $7, %%mm7 \n\t" \
559
560#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
561
562#define REAL_YSCALEYUV2RGB1(index, c) \
563 "xor "#index", "#index" \n\t"\
564 ASMALIGN(4)\
565 "1: \n\t"\
566 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
567 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
568 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
569 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
570 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
571 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
572 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
573 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
574 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
575 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
576 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
577 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
578 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
579 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
580 "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
581 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
582 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
583 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
584 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
585 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
586 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
587 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
588 "paddw %%mm3, %%mm4 \n\t"\
589 "movq %%mm2, %%mm0 \n\t"\
590 "movq %%mm5, %%mm6 \n\t"\
591 "movq %%mm4, %%mm3 \n\t"\
592 "punpcklwd %%mm2, %%mm2 \n\t"\
593 "punpcklwd %%mm5, %%mm5 \n\t"\
594 "punpcklwd %%mm4, %%mm4 \n\t"\
595 "paddw %%mm1, %%mm2 \n\t"\
596 "paddw %%mm1, %%mm5 \n\t"\
597 "paddw %%mm1, %%mm4 \n\t"\
598 "punpckhwd %%mm0, %%mm0 \n\t"\
599 "punpckhwd %%mm6, %%mm6 \n\t"\
600 "punpckhwd %%mm3, %%mm3 \n\t"\
601 "paddw %%mm7, %%mm0 \n\t"\
602 "paddw %%mm7, %%mm6 \n\t"\
603 "paddw %%mm7, %%mm3 \n\t"\
604 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
605 "packuswb %%mm0, %%mm2 \n\t"\
606 "packuswb %%mm6, %%mm5 \n\t"\
607 "packuswb %%mm3, %%mm4 \n\t"\
608 "pxor %%mm7, %%mm7 \n\t"
609#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
610
611#define REAL_YSCALEYUV2PACKED1b(index, c) \
612 "xor "#index", "#index" \n\t"\
613 ASMALIGN(4)\
614 "1: \n\t"\
615 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
616 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
617 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
618 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
619 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
620 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
621 "psrlw $8, %%mm3 \n\t" \
622 "psrlw $8, %%mm4 \n\t" \
623 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
624 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
625 "psraw $7, %%mm1 \n\t" \
626 "psraw $7, %%mm7 \n\t"
627#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
628
629// do vertical chrominance interpolation
630#define REAL_YSCALEYUV2RGB1b(index, c) \
631 "xor "#index", "#index" \n\t"\
632 ASMALIGN(4)\
633 "1: \n\t"\
634 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
635 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
636 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
637 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
638 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
639 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
640 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
641 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
642 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
643 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
644 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
645 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
646 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
647 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
648 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
649 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
650 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
651 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
652 "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
653 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
654 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
655 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
656 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
657 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
658 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
659 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
660 "paddw %%mm3, %%mm4 \n\t"\
661 "movq %%mm2, %%mm0 \n\t"\
662 "movq %%mm5, %%mm6 \n\t"\
663 "movq %%mm4, %%mm3 \n\t"\
664 "punpcklwd %%mm2, %%mm2 \n\t"\
665 "punpcklwd %%mm5, %%mm5 \n\t"\
666 "punpcklwd %%mm4, %%mm4 \n\t"\
667 "paddw %%mm1, %%mm2 \n\t"\
668 "paddw %%mm1, %%mm5 \n\t"\
669 "paddw %%mm1, %%mm4 \n\t"\
670 "punpckhwd %%mm0, %%mm0 \n\t"\
671 "punpckhwd %%mm6, %%mm6 \n\t"\
672 "punpckhwd %%mm3, %%mm3 \n\t"\
673 "paddw %%mm7, %%mm0 \n\t"\
674 "paddw %%mm7, %%mm6 \n\t"\
675 "paddw %%mm7, %%mm3 \n\t"\
676 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
677 "packuswb %%mm0, %%mm2 \n\t"\
678 "packuswb %%mm6, %%mm5 \n\t"\
679 "packuswb %%mm3, %%mm4 \n\t"\
680 "pxor %%mm7, %%mm7 \n\t"
681#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
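
/* Note on the "1b" variants: rather than blending the two chroma rows with a
 * 16-bit weight they simply average them, and the psrlw $5 above folds the
 * /2 of that average into the usual >>4 precision drop, since
 * (uv0+uv1)>>5 == ((uv0+uv1)/2)>>4. The FIXME stands because the paddw is an
 * unsaturated 16-bit add. */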
682
683#define REAL_WRITEBGR32(dst, dstw, index) \
684 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
685 "movq %%mm2, %%mm1 \n\t" /* B */\
686 "movq %%mm5, %%mm6 \n\t" /* R */\
687 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
688 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
689 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
690 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
691 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
692 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
693 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
694 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
695 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
696 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
697\
698 MOVNTQ(%%mm0, (dst, index, 4))\
699 MOVNTQ(%%mm2, 8(dst, index, 4))\
700 MOVNTQ(%%mm1, 16(dst, index, 4))\
701 MOVNTQ(%%mm3, 24(dst, index, 4))\
702\
703 "add $8, "#index" \n\t"\
704 "cmp "#dstw", "#index" \n\t"\
705 " jb 1b \n\t"
706#define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index)
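
/* In scalar form, WRITEBGR32 stores each pixel as four little-endian bytes
 * B, G, R, 0 (the zero byte comes from %%mm7); a rough equivalent (sketch,
 * not part of the original file): */
static void writebgr32_sketch(uint8_t *dst, const uint8_t *B,
                              const uint8_t *G, const uint8_t *R, int n)
{
    int i;
    for (i = 0; i < n; i++)
        ((uint32_t *)dst)[i] = B[i] | (G[i] << 8) | (R[i] << 16);
}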
707
708#define REAL_WRITEBGR16(dst, dstw, index) \
709 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
710 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
711 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
712 "psrlq $3, %%mm2 \n\t"\
713\
714 "movq %%mm2, %%mm1 \n\t"\
715 "movq %%mm4, %%mm3 \n\t"\
716\
717 "punpcklbw %%mm7, %%mm3 \n\t"\
718 "punpcklbw %%mm5, %%mm2 \n\t"\
719 "punpckhbw %%mm7, %%mm4 \n\t"\
720 "punpckhbw %%mm5, %%mm1 \n\t"\
721\
722 "psllq $3, %%mm3 \n\t"\
723 "psllq $3, %%mm4 \n\t"\
724\
725 "por %%mm3, %%mm2 \n\t"\
726 "por %%mm4, %%mm1 \n\t"\
727\
728 MOVNTQ(%%mm2, (dst, index, 2))\
729 MOVNTQ(%%mm1, 8(dst, index, 2))\
730\
731 "add $8, "#index" \n\t"\
732 "cmp "#dstw", "#index" \n\t"\
733 " jb 1b \n\t"
734#define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index)
735
736#define REAL_WRITEBGR15(dst, dstw, index) \
737 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
738 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
739 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
740 "psrlq $3, %%mm2 \n\t"\
741 "psrlq $1, %%mm5 \n\t"\
742\
743 "movq %%mm2, %%mm1 \n\t"\
744 "movq %%mm4, %%mm3 \n\t"\
745\
746 "punpcklbw %%mm7, %%mm3 \n\t"\
747 "punpcklbw %%mm5, %%mm2 \n\t"\
748 "punpckhbw %%mm7, %%mm4 \n\t"\
749 "punpckhbw %%mm5, %%mm1 \n\t"\
750\
751 "psllq $2, %%mm3 \n\t"\
752 "psllq $2, %%mm4 \n\t"\
753\
754 "por %%mm3, %%mm2 \n\t"\
755 "por %%mm4, %%mm1 \n\t"\
756\
757 MOVNTQ(%%mm2, (dst, index, 2))\
758 MOVNTQ(%%mm1, 8(dst, index, 2))\
759\
760 "add $8, "#index" \n\t"\
761 "cmp "#dstw", "#index" \n\t"\
762 " jb 1b \n\t"
763#define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index)
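
/* The pand/psrlq/psllq shuffles in the two 16bpp writers implement these
 * per-pixel packs on eight pixels at a time (scalar sketch): */
static uint16_t pack565_sketch(int r, int g, int b) /* WRITEBGR16 */
{
    return (b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11);
}
static uint16_t pack555_sketch(int r, int g, int b) /* WRITEBGR15 */
{
    return (b >> 3) | ((g >> 3) << 5) | ((r >> 3) << 10);
}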
764
765#define WRITEBGR24OLD(dst, dstw, index) \
766 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
767 "movq %%mm2, %%mm1 \n\t" /* B */\
768 "movq %%mm5, %%mm6 \n\t" /* R */\
769 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
770 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
771 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
772 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
773 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
774 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
775 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
776 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
777 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
778 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
779\
780 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
781 "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
782 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
783 "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
784 "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
785 "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
786 "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
787 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
788\
789 "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
790 "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
791 "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
792 "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
793 "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
794 "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
795 "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
796 "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
797 "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
798 "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
799 "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
800 "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
801 "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
802\
803 "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
804 "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
805 "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
806 "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
807 "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
808 "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
809 "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
810 "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
811\
812 MOVNTQ(%%mm0, (dst))\
813 MOVNTQ(%%mm2, 8(dst))\
814 MOVNTQ(%%mm3, 16(dst))\
815 "add $24, "#dst" \n\t"\
816\
817 "add $8, "#index" \n\t"\
818 "cmp "#dstw", "#index" \n\t"\
819 " jb 1b \n\t"
820
821#define WRITEBGR24MMX(dst, dstw, index) \
822 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
823 "movq %%mm2, %%mm1 \n\t" /* B */\
824 "movq %%mm5, %%mm6 \n\t" /* R */\
825 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
826 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
827 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
828 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
829 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
830 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
831 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
832 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
833 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
834 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
835\
836 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
837 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
838 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
839 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
840\
841 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
842 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
843 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
844 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
845\
846 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
847 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
848 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
849 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
850\
851 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
852 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
853 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
854 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
855 MOVNTQ(%%mm0, (dst))\
856\
857 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
858 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
859 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
860 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
861 MOVNTQ(%%mm6, 8(dst))\
862\
863 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
864 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
865 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
866 MOVNTQ(%%mm5, 16(dst))\
867\
868 "add $24, "#dst" \n\t"\
869\
870 "add $8, "#index" \n\t"\
871 "cmp "#dstw", "#index" \n\t"\
872 " jb 1b \n\t"
873
874#define WRITEBGR24MMX2(dst, dstw, index) \
875 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
876 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
877 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
878 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
879 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
880 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
881\
882 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
883 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
884 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
885\
886 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
887 "por %%mm1, %%mm6 \n\t"\
888 "por %%mm3, %%mm6 \n\t"\
889 MOVNTQ(%%mm6, (dst))\
890\
891 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
892 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
893 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
894 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
895\
896 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
897 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
898 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
899\
900 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
901 "por %%mm3, %%mm6 \n\t"\
902 MOVNTQ(%%mm6, 8(dst))\
903\
904 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
905 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
906 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
907\
908 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
909 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
910 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
911\
912 "por %%mm1, %%mm3 \n\t"\
913 "por %%mm3, %%mm6 \n\t"\
914 MOVNTQ(%%mm6, 16(dst))\
915\
916 "add $24, "#dst" \n\t"\
917\
918 "add $8, "#index" \n\t"\
919 "cmp "#dstw", "#index" \n\t"\
920 " jb 1b \n\t"
921
922#ifdef HAVE_MMX2
923#undef WRITEBGR24
924#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
925#else
926#undef WRITEBGR24
927#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
928#endif
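
/* Both 24-bit writers produce the same layout -- the MMX version shuffles
 * with psllq/punpckhdq/por, the MMX2 one with pshufw and the ff_M24* masks --
 * namely three bytes B, G, R per pixel (scalar sketch): */
static void writebgr24_sketch(uint8_t *dst, const uint8_t *B,
                              const uint8_t *G, const uint8_t *R, int n)
{
    int i;
    for (i = 0; i < n; i++) {
        dst[3*i + 0] = B[i];
        dst[3*i + 1] = G[i];
        dst[3*i + 2] = R[i];
    }
}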
929
930#define REAL_WRITEYUY2(dst, dstw, index) \
931 "packuswb %%mm3, %%mm3 \n\t"\
932 "packuswb %%mm4, %%mm4 \n\t"\
933 "packuswb %%mm7, %%mm1 \n\t"\
934 "punpcklbw %%mm4, %%mm3 \n\t"\
935 "movq %%mm1, %%mm7 \n\t"\
936 "punpcklbw %%mm3, %%mm1 \n\t"\
937 "punpckhbw %%mm3, %%mm7 \n\t"\
938\
939 MOVNTQ(%%mm1, (dst, index, 2))\
940 MOVNTQ(%%mm7, 8(dst, index, 2))\
941\
942 "add $8, "#index" \n\t"\
943 "cmp "#dstw", "#index" \n\t"\
944 " jb 1b \n\t"
945#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
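
/* WRITEYUY2 interleaves packed 4:2:2 output, one chroma pair per two luma
 * samples (scalar sketch of the punpcklbw/punpckhbw shuffle above): */
static void writeyuy2_sketch(uint8_t *dst, const uint8_t *Y,
                             const uint8_t *U, const uint8_t *V, int pairs)
{
    int i;
    for (i = 0; i < pairs; i++) {
        dst[4*i + 0] = Y[2*i + 0];
        dst[4*i + 1] = U[i];
        dst[4*i + 2] = Y[2*i + 1];
        dst[4*i + 3] = V[i];
    }
}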
946
947
948static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
949 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
950 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
951{
952#ifdef HAVE_MMX
953 if (c->flags & SWS_ACCURATE_RND){
954 if (uDest){
955 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
956 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
957 }
958
959 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
960 }else{
961 if (uDest){
962 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
963 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
964 }
965
966 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
967 }
968#else
969#ifdef HAVE_ALTIVEC
970yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
971 chrFilter, chrSrc, chrFilterSize,
972 dest, uDest, vDest, dstW, chrDstW);
973#else //HAVE_ALTIVEC
974yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
975 chrFilter, chrSrc, chrFilterSize,
976 dest, uDest, vDest, dstW, chrDstW);
977#endif //!HAVE_ALTIVEC
978#endif /* HAVE_MMX */
979}
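
/* All three paths above compute the same vertical FIR; roughly what the C
 * fallback yuv2yuvXinC does for the luma plane (the chroma planes are the
 * same loop over chrFilter/chrSrc, with V read at the VOFW offset): */
static void yuv2yuvX_sketch(const int16_t *lumFilter, int16_t **lumSrc,
                            int lumFilterSize, uint8_t *dest, int dstW)
{
    int i, j;
    for (i = 0; i < dstW; i++) {
        int val = 1 << 18;             /* rounding term for the >>19 below */
        for (j = 0; j < lumFilterSize; j++)
            val += lumSrc[j][i] * lumFilter[j];
        val >>= 19;
        dest[i] = val < 0 ? 0 : val > 255 ? 255 : val;
    }
}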
980
981static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
982 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
983 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
984{
985yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
986 chrFilter, chrSrc, chrFilterSize,
987 dest, uDest, dstW, chrDstW, dstFormat);
988}
989
990static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
991 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
992{
993#ifdef HAVE_MMX
994 long p= uDest ? 3 : 1;
995 uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
996 uint8_t *dst[3]= {dest, uDest, vDest};
997 long counter[3] = {dstW, chrDstW, chrDstW};
998
999 if (c->flags & SWS_ACCURATE_RND){
1000 while(p--){
1001 asm volatile(
1002 YSCALEYUV2YV121_ACCURATE
1003 :: "r" (src[p]), "r" (dst[p] + counter[p]),
1004 "g" (-counter[p])
1005 : "%"REG_a
1006 );
1007 }
1008 }else{
1009 while(p--){
1010 asm volatile(
1011 YSCALEYUV2YV121
1012 :: "r" (src[p]), "r" (dst[p] + counter[p]),
1013 "g" (-counter[p])
1014 : "%"REG_a
1015 );
1016 }
1017 }
1018
1019#else
1020 int i;
1021 for (i=0; i<dstW; i++)
1022 {
1023 int val= (lumSrc[i]+64)>>7;
1024
1025 if (val&256){
1026 if (val<0) val=0;
1027 else val=255;
1028 }
1029
1030 dest[i]= val;
1031 }
1032
1033 if (uDest)
1034 for (i=0; i<chrDstW; i++)
1035 {
1036 int u=(chrSrc[i ]+64)>>7;
1037 int v=(chrSrc[i + VOFW]+64)>>7;
1038
1039 if ((u|v)&256){
1040 if (u<0) u=0;
1041 else if (u>255) u=255;
1042 if (v<0) v=0;
1043 else if (v>255) v=255;
1044 }
1045
1046 uDest[i]= u;
1047 vDest[i]= v;
1048 }
1049#endif
1050}
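
/* The (val&256) and ((u|v)&256) tests in the C path above are a cheap
 * out-of-range check: for the values (x+64)>>7 can produce here, bit 8 is
 * set exactly when the result lies outside [0,255], so the clamping branches
 * are skipped for in-range samples. As a helper (sketch, valid only for
 * inputs in [-256,511]): */
static int clip8_sketch(int val)
{
    if (val & 256)
        val = val < 0 ? 0 : 255;
    return val;
}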
1051
1052
1053/**
1054 * vertically scale YV12 and convert to RGB
1055 */
1056static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1057 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1058 uint8_t *dest, long dstW, long dstY)
1059{
1060#ifdef HAVE_MMX
1061 long dummy=0;
1062 if (c->flags & SWS_ACCURATE_RND){
1063 switch(c->dstFormat){
1064 case PIX_FMT_RGB32:
1065 YSCALEYUV2PACKEDX_ACCURATE
1066 YSCALEYUV2RGBX
1067 WRITEBGR32(%4, %5, %%REGa)
1068
1069 YSCALEYUV2PACKEDX_END
1070 return;
1071 case PIX_FMT_BGR24:
1072 YSCALEYUV2PACKEDX_ACCURATE
1073 YSCALEYUV2RGBX
1074 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1075 "add %4, %%"REG_c" \n\t"
1076 WRITEBGR24(%%REGc, %5, %%REGa)
1077
1078
1079 :: "r" (&c->redDither),
1080 "m" (dummy), "m" (dummy), "m" (dummy),
1081 "r" (dest), "m" (dstW)
1082 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1083 );
1084 return;
1085 case PIX_FMT_BGR555:
1086 YSCALEYUV2PACKEDX_ACCURATE
1087 YSCALEYUV2RGBX
1088 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1089#ifdef DITHER1XBPP
1090 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1091 "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1092 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1093#endif
1094
1095 WRITEBGR15(%4, %5, %%REGa)
1096 YSCALEYUV2PACKEDX_END
1097 return;
1098 case PIX_FMT_BGR565:
1099 YSCALEYUV2PACKEDX_ACCURATE
1100 YSCALEYUV2RGBX
1101 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1102#ifdef DITHER1XBPP
1103 "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1104 "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1105 "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1106#endif
1107
1108 WRITEBGR16(%4, %5, %%REGa)
1109 YSCALEYUV2PACKEDX_END
1110 return;
1111 case PIX_FMT_YUYV422:
1112 YSCALEYUV2PACKEDX_ACCURATE
1113 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1114
1115 "psraw $3, %%mm3 \n\t"
1116 "psraw $3, %%mm4 \n\t"
1117 "psraw $3, %%mm1 \n\t"
1118 "psraw $3, %%mm7 \n\t"
1119 WRITEYUY2(%4, %5, %%REGa)
1120 YSCALEYUV2PACKEDX_END
1121 return;
1122 }
1123 }else{
1124 switch(c->dstFormat)
1125 {
1126 case PIX_FMT_RGB32:
1127 YSCALEYUV2PACKEDX
1128 YSCALEYUV2RGBX
1129 WRITEBGR32(%4, %5, %%REGa)
1130 YSCALEYUV2PACKEDX_END
1131 return;
1132 case PIX_FMT_BGR24:
1133 YSCALEYUV2PACKEDX
1134 YSCALEYUV2RGBX
1135 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
1136 "add %4, %%"REG_c" \n\t"
1137 WRITEBGR24(%%REGc, %5, %%REGa)
1138
1139 :: "r" (&c->redDither),
1140 "m" (dummy), "m" (dummy), "m" (dummy),
1141 "r" (dest), "m" (dstW)
1142 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1143 );
1144 return;
1145 case PIX_FMT_BGR555:
1146 YSCALEYUV2PACKEDX
1147 YSCALEYUV2RGBX
1148 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1149#ifdef DITHER1XBPP
1150 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1151 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1152 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1153#endif
1154
1155 WRITEBGR15(%4, %5, %%REGa)
1156 YSCALEYUV2PACKEDX_END
1157 return;
1158 case PIX_FMT_BGR565:
1159 YSCALEYUV2PACKEDX
1160 YSCALEYUV2RGBX
1161 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1162#ifdef DITHER1XBPP
1163 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1164 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1165 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1166#endif
1167
1168 WRITEBGR16(%4, %5, %%REGa)
1169 YSCALEYUV2PACKEDX_END
1170 return;
1171 case PIX_FMT_YUYV422:
1172 YSCALEYUV2PACKEDX
1173 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1174
1175 "psraw $3, %%mm3 \n\t"
1176 "psraw $3, %%mm4 \n\t"
1177 "psraw $3, %%mm1 \n\t"
1178 "psraw $3, %%mm7 \n\t"
1179 WRITEYUY2(%4, %5, %%REGa)
1180 YSCALEYUV2PACKEDX_END
1181 return;
1182 }
1183 }
1184#endif /* HAVE_MMX */
1185#ifdef HAVE_ALTIVEC
1186 /* The following list of supported dstFormat values should
1187 match what's found in the body of altivec_yuv2packedX() */
1188 if (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
1189 c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1190 c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)
1191 altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1192 chrFilter, chrSrc, chrFilterSize,
1193 dest, dstW, dstY);
1194 else
1195#endif
1196 yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1197 chrFilter, chrSrc, chrFilterSize,
1198 dest, dstW, dstY);
1199}
1200
1201/**
1202 * vertically scale YV12 with bilinear interpolation and convert to RGB
1203 */
1204static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1205 uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1206{
1207 int yalpha1=yalpha^4095;
1208 int uvalpha1=uvalpha^4095;
1209 int i;
1210
1211#if 0 //isn't used
1212 if (flags&SWS_FULL_CHR_H_INT)
1213 {
1214 switch(dstFormat)
1215 {
1216#ifdef HAVE_MMX
1217 case PIX_FMT_RGB32:
1218 asm volatile(
1219
1220
1221FULL_YSCALEYUV2RGB
1222 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1223 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1224
1225 "movq %%mm3, %%mm1 \n\t"
1226 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1227 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1228
1229 MOVNTQ(%%mm3, (%4, %%REGa, 4))
1230 MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1231
1232 "add $4, %%"REG_a" \n\t"
1233 "cmp %5, %%"REG_a" \n\t"
1234 " jb 1b \n\t"
1235
1236 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1237 "m" (yalpha1), "m" (uvalpha1)
1238 : "%"REG_a
1239 );
1240 break;
1241 case PIX_FMT_BGR24:
1242 asm volatile(
1243
1244FULL_YSCALEYUV2RGB
1245
1246 // lsb ... msb
1247 "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
1248 "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
1249
1250 "movq %%mm3, %%mm1 \n\t"
1251 "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
1252 "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
1253
1254 "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
1255 "psrlq $8, %%mm3 \n\t" // GR0BGR00
1256 "pand "MANGLE(bm00000111)", %%mm2 \n\t" // BGR00000
1257 "pand "MANGLE(bm11111000)", %%mm3 \n\t" // 000BGR00
1258 "por %%mm2, %%mm3 \n\t" // BGRBGR00
1259 "movq %%mm1, %%mm2 \n\t"
1260 "psllq $48, %%mm1 \n\t" // 000000BG
1261 "por %%mm1, %%mm3 \n\t" // BGRBGRBG
1262
1263 "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
1264 "psrld $16, %%mm2 \n\t" // R000R000
1265 "psrlq $24, %%mm1 \n\t" // 0BGR0000
1266 "por %%mm2, %%mm1 \n\t" // RBGRR000
1267
1268 "mov %4, %%"REG_b" \n\t"
1269 "add %%"REG_a", %%"REG_b" \n\t"
1270
1271#ifdef HAVE_MMX2
1272 //FIXME Alignment
1273 "movntq %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1274 "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1275#else
1276 "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
1277 "psrlq $32, %%mm3 \n\t"
1278 "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
1279 "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
1280#endif
1281 "add $4, %%"REG_a" \n\t"
1282 "cmp %5, %%"REG_a" \n\t"
1283 " jb 1b \n\t"
1284
1285 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1286 "m" (yalpha1), "m" (uvalpha1)
1287 : "%"REG_a, "%"REG_b
1288 );
1289 break;
1290 case PIX_FMT_BGR555:
1291 asm volatile(
1292
1293FULL_YSCALEYUV2RGB
1294#ifdef DITHER1XBPP
1295 "paddusb "MANGLE(g5Dither)", %%mm1 \n\t"
1296 "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
1297 "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
1298#endif
1299 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1300 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1301 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1302
1303 "psrlw $3, %%mm3 \n\t"
1304 "psllw $2, %%mm1 \n\t"
1305 "psllw $7, %%mm0 \n\t"
1306 "pand "MANGLE(g15Mask)", %%mm1 \n\t"
1307 "pand "MANGLE(r15Mask)", %%mm0 \n\t"
1308
1309 "por %%mm3, %%mm1 \n\t"
1310 "por %%mm1, %%mm0 \n\t"
1311
1312 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1313
1314 "add $4, %%"REG_a" \n\t"
1315 "cmp %5, %%"REG_a" \n\t"
1316 " jb 1b \n\t"
1317
1318 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1319 "m" (yalpha1), "m" (uvalpha1)
1320 : "%"REG_a
1321 );
1322 break;
1323 case PIX_FMT_BGR565:
1324 asm volatile(
1325
1326FULL_YSCALEYUV2RGB
1327#ifdef DITHER1XBPP
1328 "paddusb "MANGLE(g6Dither)", %%mm1 \n\t"
1329 "paddusb "MANGLE(r5Dither)", %%mm0 \n\t"
1330 "paddusb "MANGLE(b5Dither)", %%mm3 \n\t"
1331#endif
1332 "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
1333 "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
1334 "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
1335
1336 "psrlw $3, %%mm3 \n\t"
1337 "psllw $3, %%mm1 \n\t"
1338 "psllw $8, %%mm0 \n\t"
1339 "pand "MANGLE(g16Mask)", %%mm1 \n\t"
1340 "pand "MANGLE(r16Mask)", %%mm0 \n\t"
1341
1342 "por %%mm3, %%mm1 \n\t"
1343 "por %%mm1, %%mm0 \n\t"
1344
1345 MOVNTQ(%%mm0, (%4, %%REGa, 2))
1346
1347 "add $4, %%"REG_a" \n\t"
1348 "cmp %5, %%"REG_a" \n\t"
1349 " jb 1b \n\t"
1350
1351 :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1352 "m" (yalpha1), "m" (uvalpha1)
1353 : "%"REG_a
1354 );
1355 break;
1356#endif /* HAVE_MMX */
1357 case PIX_FMT_BGR32:
1358#ifndef HAVE_MMX
1359 case PIX_FMT_RGB32:
1360#endif
1361 if (dstFormat==PIX_FMT_RGB32)
1362 {
1363 int i;
1364#ifdef WORDS_BIGENDIAN
1365 dest++;
1366#endif
1367 for (i=0;i<dstW;i++){
1368 // vertical linear interpolation && yuv2rgb in a single step:
1369 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1370 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1371 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1372 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1373 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1374 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1375 dest+= 4;
1376 }
1377 }
1378 else if (dstFormat==PIX_FMT_BGR24)
1379 {
1380 int i;
1381 for (i=0;i<dstW;i++){
1382 // vertical linear interpolation && yuv2rgb in a single step:
1383 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1384 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1385 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1386 dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1387 dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1388 dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1389 dest+= 3;
1390 }
1391 }
1392 else if (dstFormat==PIX_FMT_BGR565)
1393 {
1394 int i;
1395 for (i=0;i<dstW;i++){
1396 // vertical linear interpolation && yuv2rgb in a single step:
1397 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1398 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1399 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1400
1401 ((uint16_t*)dest)[i] =
1402 clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1403 clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1404 clip_table16r[(Y + yuvtab_3343[V]) >>13];
1405 }
1406 }
1407 else if (dstFormat==PIX_FMT_BGR555)
1408 {
1409 int i;
1410 for (i=0;i<dstW;i++){
1411 // vertical linear interpolation && yuv2rgb in a single step:
1412 int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1413 int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1414 int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1415
1416 ((uint16_t*)dest)[i] =
1417 clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1418 clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1419 clip_table15r[(Y + yuvtab_3343[V]) >>13];
1420 }
1421 }
1422 }//FULL_UV_IPOL
1423 else
1424 {
1425#endif // if 0
1426#ifdef HAVE_MMX
1427 switch(c->dstFormat)
1428 {
1429 //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1430 case PIX_FMT_RGB32:
1431 asm volatile(
1432 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1433 "mov %4, %%"REG_b" \n\t"
1434 "push %%"REG_BP" \n\t"
1435 YSCALEYUV2RGB(%%REGBP, %5)
1436 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1437 "pop %%"REG_BP" \n\t"
1438 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1439
1440 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1441 "a" (&c->redDither)
1442 );
1443 return;
1444 case PIX_FMT_BGR24:
1445 asm volatile(
1446 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1447 "mov %4, %%"REG_b" \n\t"
1448 "push %%"REG_BP" \n\t"
1449 YSCALEYUV2RGB(%%REGBP, %5)
1450 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1451 "pop %%"REG_BP" \n\t"
1452 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1453 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1454 "a" (&c->redDither)
1455 );
1456 return;
1457 case PIX_FMT_BGR555:
1458 asm volatile(
1459 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1460 "mov %4, %%"REG_b" \n\t"
1461 "push %%"REG_BP" \n\t"
1462 YSCALEYUV2RGB(%%REGBP, %5)
1463 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1464#ifdef DITHER1XBPP
1465 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1466 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1467 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1468#endif
1469
1470 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1471 "pop %%"REG_BP" \n\t"
1472 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1473
1474 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1475 "a" (&c->redDither)
1476 );
1477 return;
1478 case PIX_FMT_BGR565:
1479 asm volatile(
1480 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1481 "mov %4, %%"REG_b" \n\t"
1482 "push %%"REG_BP" \n\t"
1483 YSCALEYUV2RGB(%%REGBP, %5)
1484 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1485#ifdef DITHER1XBPP
1486 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1487 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1488 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1489#endif
1490
1491 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1492 "pop %%"REG_BP" \n\t"
1493 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1494 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1495 "a" (&c->redDither)
1496 );
1497 return;
1498 case PIX_FMT_YUYV422:
1499 asm volatile(
1500 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1501 "mov %4, %%"REG_b" \n\t"
1502 "push %%"REG_BP" \n\t"
1503 YSCALEYUV2PACKED(%%REGBP, %5)
1504 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1505 "pop %%"REG_BP" \n\t"
1506 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1507 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1508 "a" (&c->redDither)
1509 );
1510 return;
1511 default: break;
1512 }
1513#endif //HAVE_MMX
1514YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1515}
1516
1517/**
1518 * YV12 to RGB without scaling or interpolating
1519 */
1520static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1521 uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1522{
1523 const int yalpha1=0;
1524 int i;
1525
1526 uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1527 const int yalpha= 4096; //FIXME ...
1528
1529 if (flags&SWS_FULL_CHR_H_INT)
1530 {
1531 RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1532 return;
1533 }
1534
1535#ifdef HAVE_MMX
1536 if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1537 {
1538 switch(dstFormat)
1539 {
1540 case PIX_FMT_RGB32:
1541 asm volatile(
1542 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1543 "mov %4, %%"REG_b" \n\t"
1544 "push %%"REG_BP" \n\t"
1545 YSCALEYUV2RGB1(%%REGBP, %5)
1546 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1547 "pop %%"REG_BP" \n\t"
1548 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1549
1550 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1551 "a" (&c->redDither)
1552 );
1553 return;
1554 case PIX_FMT_BGR24:
1555 asm volatile(
1556 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1557 "mov %4, %%"REG_b" \n\t"
1558 "push %%"REG_BP" \n\t"
1559 YSCALEYUV2RGB1(%%REGBP, %5)
1560 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1561 "pop %%"REG_BP" \n\t"
1562 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1563
1564 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1565 "a" (&c->redDither)
1566 );
1567 return;
1568 case PIX_FMT_BGR555:
1569 asm volatile(
1570 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1571 "mov %4, %%"REG_b" \n\t"
1572 "push %%"REG_BP" \n\t"
1573 YSCALEYUV2RGB1(%%REGBP, %5)
1574 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1575#ifdef DITHER1XBPP
1576 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1577 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1578 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1579#endif
1580 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1581 "pop %%"REG_BP" \n\t"
1582 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1583
1584 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1585 "a" (&c->redDither)
1586 );
1587 return;
1588 case PIX_FMT_BGR565:
1589 asm volatile(
1590 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1591 "mov %4, %%"REG_b" \n\t"
1592 "push %%"REG_BP" \n\t"
1593 YSCALEYUV2RGB1(%%REGBP, %5)
1594 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1595#ifdef DITHER1XBPP
1596 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1597 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1598 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1599#endif
1600
1601 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1602 "pop %%"REG_BP" \n\t"
1603 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1604
1605 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1606 "a" (&c->redDither)
1607 );
1608 return;
1609 case PIX_FMT_YUYV422:
1610 asm volatile(
1611 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1612 "mov %4, %%"REG_b" \n\t"
1613 "push %%"REG_BP" \n\t"
1614 YSCALEYUV2PACKED1(%%REGBP, %5)
1615 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1616 "pop %%"REG_BP" \n\t"
1617 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1618
1619 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1620 "a" (&c->redDither)
1621 );
1622 return;
1623 }
1624 }
1625 else
1626 {
1627 switch(dstFormat)
1628 {
1629 case PIX_FMT_RGB32:
1630 asm volatile(
1631 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1632 "mov %4, %%"REG_b" \n\t"
1633 "push %%"REG_BP" \n\t"
1634 YSCALEYUV2RGB1b(%%REGBP, %5)
1635 WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1636 "pop %%"REG_BP" \n\t"
1637 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1638
1639 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1640 "a" (&c->redDither)
1641 );
1642 return;
1643 case PIX_FMT_BGR24:
1644 asm volatile(
1645 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1646 "mov %4, %%"REG_b" \n\t"
1647 "push %%"REG_BP" \n\t"
1648 YSCALEYUV2RGB1b(%%REGBP, %5)
1649 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1650 "pop %%"REG_BP" \n\t"
1651 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1652
1653 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1654 "a" (&c->redDither)
1655 );
1656 return;
1657 case PIX_FMT_BGR555:
1658 asm volatile(
1659 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1660 "mov %4, %%"REG_b" \n\t"
1661 "push %%"REG_BP" \n\t"
1662 YSCALEYUV2RGB1b(%%REGBP, %5)
1663 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1664#ifdef DITHER1XBPP
1665 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1666 "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
1667 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1668#endif
1669 WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1670 "pop %%"REG_BP" \n\t"
1671 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1672
1673 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1674 "a" (&c->redDither)
1675 );
1676 return;
1677 case PIX_FMT_BGR565:
1678 asm volatile(
1679 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1680 "mov %4, %%"REG_b" \n\t"
1681 "push %%"REG_BP" \n\t"
1682 YSCALEYUV2RGB1b(%%REGBP, %5)
1683 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1684#ifdef DITHER1XBPP
1685 "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
1686 "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
1687 "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
1688#endif
1689
1690 WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1691 "pop %%"REG_BP" \n\t"
1692 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1693
1694 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1695 "a" (&c->redDither)
1696 );
1697 return;
1698 case PIX_FMT_YUYV422:
1699 asm volatile(
1700 "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
1701 "mov %4, %%"REG_b" \n\t"
1702 "push %%"REG_BP" \n\t"
1703 YSCALEYUV2PACKED1b(%%REGBP, %5)
1704 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1705 "pop %%"REG_BP" \n\t"
1706 "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
1707
1708 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1709 "a" (&c->redDither)
1710 );
1711 return;
1712 }
1713 }
1714#endif /* HAVE_MMX */
1715 if (uvalpha < 2048)
1716 {
1717 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1718 }else{
1719 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1720 }
1721}
1722
1723//FIXME yuy2* can read up to 7 samples too much
1724
1725static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1726{
1727#ifdef HAVE_MMX
1728 asm volatile(
1729 "movq "MANGLE(bm01010101)", %%mm2 \n\t"
1730 "mov %0, %%"REG_a" \n\t"
1731 "1: \n\t"
1732 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1733 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1734 "pand %%mm2, %%mm0 \n\t"
1735 "pand %%mm2, %%mm1 \n\t"
1736 "packuswb %%mm1, %%mm0 \n\t"
1737 "movq %%mm0, (%2, %%"REG_a") \n\t"
1738 "add $8, %%"REG_a" \n\t"
1739 " js 1b \n\t"
1740 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1741 : "%"REG_a
1742 );
1743#else
1744 int i;
1745 for (i=0; i<width; i++)
1746 dst[i]= src[2*i];
1747#endif
1748}
1749
1750static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1751{
1752#ifdef HAVE_MMX
1753 asm volatile(
1754 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1755 "mov %0, %%"REG_a" \n\t"
1756 "1: \n\t"
1757 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1758 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1759 "psrlw $8, %%mm0 \n\t"
1760 "psrlw $8, %%mm1 \n\t"
1761 "packuswb %%mm1, %%mm0 \n\t"
1762 "movq %%mm0, %%mm1 \n\t"
1763 "psrlw $8, %%mm0 \n\t"
1764 "pand %%mm4, %%mm1 \n\t"
1765 "packuswb %%mm0, %%mm0 \n\t"
1766 "packuswb %%mm1, %%mm1 \n\t"
1767 "movd %%mm0, (%3, %%"REG_a") \n\t"
1768 "movd %%mm1, (%2, %%"REG_a") \n\t"
1769 "add $4, %%"REG_a" \n\t"
1770 " js 1b \n\t"
1771 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1772 : "%"REG_a
1773 );
1774#else
1775 int i;
1776 for (i=0; i<width; i++)
1777 {
1778 dstU[i]= src1[4*i + 1];
1779 dstV[i]= src1[4*i + 3];
1780 }
1781#endif
1782 assert(src1 == src2);
1783}
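
/* Both yuy2 unpackers build on one mask trick: bm01010101 is assumed to be
 * the byte mask 0x00FF00FF00FF00FF, so pand keeps the even (Y) bytes of a
 * YUYV qword while psrlw $8 selects the odd (U/V) bytes. 64-bit scalar
 * equivalent (sketch): */
static uint64_t yuyv_luma_sketch(uint64_t v)
{
    return v & 0x00FF00FF00FF00FFULL;          /* Y0 .. Y3 */
}
static uint64_t yuyv_chroma_sketch(uint64_t v)
{
    return (v >> 8) & 0x00FF00FF00FF00FFULL;   /* U0 V0 U1 V1 */
}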
1784
1785/* This is almost identical to the previous function, and exists only because
1786 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
1787static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
1788{
1789#ifdef HAVE_MMX
1790 asm volatile(
1791 "mov %0, %%"REG_a" \n\t"
1792 "1: \n\t"
1793 "movq (%1, %%"REG_a",2), %%mm0 \n\t"
1794 "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
1795 "psrlw $8, %%mm0 \n\t"
1796 "psrlw $8, %%mm1 \n\t"
1797 "packuswb %%mm1, %%mm0 \n\t"
1798 "movq %%mm0, (%2, %%"REG_a") \n\t"
1799 "add $8, %%"REG_a" \n\t"
1800 " js 1b \n\t"
1801 : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1802 : "%"REG_a
1803 );
1804#else
1805 int i;
1806 for (i=0; i<width; i++)
1807 dst[i]= src[2*i+1];
1808#endif
1809}
1810
1811static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1812{
1813#ifdef HAVE_MMX
1814 asm volatile(
1815 "movq "MANGLE(bm01010101)", %%mm4 \n\t"
1816 "mov %0, %%"REG_a" \n\t"
1817 "1: \n\t"
1818 "movq (%1, %%"REG_a",4), %%mm0 \n\t"
1819 "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
1820 "pand %%mm4, %%mm0 \n\t"
1821 "pand %%mm4, %%mm1 \n\t"
1822 "packuswb %%mm1, %%mm0 \n\t"
1823 "movq %%mm0, %%mm1 \n\t"
1824 "psrlw $8, %%mm0 \n\t"
1825 "pand %%mm4, %%mm1 \n\t"
1826 "packuswb %%mm0, %%mm0 \n\t"
1827 "packuswb %%mm1, %%mm1 \n\t"
1828 "movd %%mm0, (%3, %%"REG_a") \n\t"
1829 "movd %%mm1, (%2, %%"REG_a") \n\t"
1830 "add $4, %%"REG_a" \n\t"
1831 " js 1b \n\t"
1832 : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1833 : "%"REG_a
1834 );
1835#else
1836 int i;
1837 for (i=0; i<width; i++)
1838 {
1839 dstU[i]= src1[4*i + 0];
1840 dstV[i]= src1[4*i + 2];
1841 }
1842#endif
1843 assert(src1 == src2);
1844}
1845
1846static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1847{
1848 int i;
1849 for (i=0; i<width; i++)
1850 {
1851 int b= ((uint32_t*)src)[i]&0xFF;
1852 int g= (((uint32_t*)src)[i]>>8)&0xFF;
1853 int r= (((uint32_t*)src)[i]>>16)&0xFF;
1854
1855 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1856 }
1857}
1858
1859static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1860{
1861 int i;
1862 assert(src1 == src2);
1863 for (i=0; i<width; i++)
1864 {
1865 const int a= ((uint32_t*)src1)[2*i+0];
1866 const int e= ((uint32_t*)src1)[2*i+1];
1867 const int l= (a&0xFF00FF) + (e&0xFF00FF);
1868 const int h= (a&0x00FF00) + (e&0x00FF00);
1869 const int b= l&0x3FF;
1870 const int g= h>>8;
1871 const int r= l>>16;
1872
1873 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
1874 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
1875 }
1876}
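
/* The masked adds in bgr32ToUV sum two neighboring 32-bit pixels per channel
 * without unpacking: (a&0xFF00FF)+(e&0xFF00FF) keeps the B and R sums in
 * disjoint bit fields (each sum is at most 9 bits, so no carry crosses
 * between them) and (a&0x00FF00)+(e&0x00FF00) does the same for G; the +1 in
 * RGB2YUV_SHIFT+1 then divides the two-pixel sums back down. rgb32ToUV below
 * reuses the trick with R and B swapped. */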
1877
1878static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
1879{
1880#ifdef HAVE_MMX
1881 asm volatile(
1882 "mov %2, %%"REG_a" \n\t"
1883 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
1884 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1885 "pxor %%mm7, %%mm7 \n\t"
1886 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1887 ASMALIGN(4)
1888 "1: \n\t"
1889 PREFETCH" 64(%0, %%"REG_d") \n\t"
1890 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1891 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
1892 "punpcklbw %%mm7, %%mm0 \n\t"
1893 "punpcklbw %%mm7, %%mm1 \n\t"
1894 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
1895 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
1896 "punpcklbw %%mm7, %%mm2 \n\t"
1897 "punpcklbw %%mm7, %%mm3 \n\t"
1898 "pmaddwd %%mm6, %%mm0 \n\t"
1899 "pmaddwd %%mm6, %%mm1 \n\t"
1900 "pmaddwd %%mm6, %%mm2 \n\t"
1901 "pmaddwd %%mm6, %%mm3 \n\t"
1902#ifndef FAST_BGR2YV12
1903 "psrad $8, %%mm0 \n\t"
1904 "psrad $8, %%mm1 \n\t"
1905 "psrad $8, %%mm2 \n\t"
1906 "psrad $8, %%mm3 \n\t"
1907#endif
1908 "packssdw %%mm1, %%mm0 \n\t"
1909 "packssdw %%mm3, %%mm2 \n\t"
1910 "pmaddwd %%mm5, %%mm0 \n\t"
1911 "pmaddwd %%mm5, %%mm2 \n\t"
1912 "packssdw %%mm2, %%mm0 \n\t"
1913 "psraw $7, %%mm0 \n\t"
1914
1915 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1916 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
1917 "punpcklbw %%mm7, %%mm4 \n\t"
1918 "punpcklbw %%mm7, %%mm1 \n\t"
1919 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
1920 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
1921 "punpcklbw %%mm7, %%mm2 \n\t"
1922 "punpcklbw %%mm7, %%mm3 \n\t"
1923 "pmaddwd %%mm6, %%mm4 \n\t"
1924 "pmaddwd %%mm6, %%mm1 \n\t"
1925 "pmaddwd %%mm6, %%mm2 \n\t"
1926 "pmaddwd %%mm6, %%mm3 \n\t"
1927#ifndef FAST_BGR2YV12
1928 "psrad $8, %%mm4 \n\t"
1929 "psrad $8, %%mm1 \n\t"
1930 "psrad $8, %%mm2 \n\t"
1931 "psrad $8, %%mm3 \n\t"
1932#endif
1933 "packssdw %%mm1, %%mm4 \n\t"
1934 "packssdw %%mm3, %%mm2 \n\t"
1935 "pmaddwd %%mm5, %%mm4 \n\t"
1936 "pmaddwd %%mm5, %%mm2 \n\t"
1937 "add $24, %%"REG_d" \n\t"
1938 "packssdw %%mm2, %%mm4 \n\t"
1939 "psraw $7, %%mm4 \n\t"
1940
1941 "packuswb %%mm4, %%mm0 \n\t"
1942 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
1943
1944 "movq %%mm0, (%1, %%"REG_a") \n\t"
1945 "add $8, %%"REG_a" \n\t"
1946 " js 1b \n\t"
1947 : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1948 : "%"REG_a, "%"REG_d
1949 );
1950#else
1951 int i;
1952 for (i=0; i<width; i++)
1953 {
1954 int b= src[i*3+0];
1955 int g= src[i*3+1];
1956 int r= src[i*3+2];
1957
1958 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1959 }
1960#endif /* HAVE_MMX */
1961}
1962
1963static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1964{
1965#ifdef HAVE_MMX
1966 asm volatile(
1967 "mov %3, %%"REG_a" \n\t"
1968 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1969 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
1970 "pxor %%mm7, %%mm7 \n\t"
1971 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1972 "add %%"REG_d", %%"REG_d" \n\t"
1973 ASMALIGN(4)
1974 "1: \n\t"
1975 PREFETCH" 64(%0, %%"REG_d") \n\t"
1976#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1977 "movq (%0, %%"REG_d"), %%mm0 \n\t"
1978 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
1979 "movq %%mm0, %%mm1 \n\t"
1980 "movq %%mm2, %%mm3 \n\t"
1981 "psrlq $24, %%mm0 \n\t"
1982 "psrlq $24, %%mm2 \n\t"
1983 PAVGB(%%mm1, %%mm0)
1984 PAVGB(%%mm3, %%mm2)
1985 "punpcklbw %%mm7, %%mm0 \n\t"
1986 "punpcklbw %%mm7, %%mm2 \n\t"
1987#else
1988 "movd (%0, %%"REG_d"), %%mm0 \n\t"
1989 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
1990 "punpcklbw %%mm7, %%mm0 \n\t"
1991 "punpcklbw %%mm7, %%mm2 \n\t"
1992 "paddw %%mm2, %%mm0 \n\t"
1993 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
1994 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
1995 "punpcklbw %%mm7, %%mm4 \n\t"
1996 "punpcklbw %%mm7, %%mm2 \n\t"
1997 "paddw %%mm4, %%mm2 \n\t"
1998 "psrlw $1, %%mm0 \n\t"
1999 "psrlw $1, %%mm2 \n\t"
2000#endif
2001 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2002 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2003
2004 "pmaddwd %%mm0, %%mm1 \n\t"
2005 "pmaddwd %%mm2, %%mm3 \n\t"
2006 "pmaddwd %%mm6, %%mm0 \n\t"
2007 "pmaddwd %%mm6, %%mm2 \n\t"
2008#ifndef FAST_BGR2YV12
2009 "psrad $8, %%mm0 \n\t"
2010 "psrad $8, %%mm1 \n\t"
2011 "psrad $8, %%mm2 \n\t"
2012 "psrad $8, %%mm3 \n\t"
2013#endif
2014 "packssdw %%mm2, %%mm0 \n\t"
2015 "packssdw %%mm3, %%mm1 \n\t"
2016 "pmaddwd %%mm5, %%mm0 \n\t"
2017 "pmaddwd %%mm5, %%mm1 \n\t"
2018 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2019 "psraw $7, %%mm0 \n\t"
2020
2021#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2022 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2023 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2024 "movq %%mm4, %%mm1 \n\t"
2025 "movq %%mm2, %%mm3 \n\t"
2026 "psrlq $24, %%mm4 \n\t"
2027 "psrlq $24, %%mm2 \n\t"
2028 PAVGB(%%mm1, %%mm4)
2029 PAVGB(%%mm3, %%mm2)
2030 "punpcklbw %%mm7, %%mm4 \n\t"
2031 "punpcklbw %%mm7, %%mm2 \n\t"
2032#else
2033 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2034 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2035 "punpcklbw %%mm7, %%mm4 \n\t"
2036 "punpcklbw %%mm7, %%mm2 \n\t"
2037 "paddw %%mm2, %%mm4 \n\t"
2038 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2039 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2040 "punpcklbw %%mm7, %%mm5 \n\t"
2041 "punpcklbw %%mm7, %%mm2 \n\t"
2042 "paddw %%mm5, %%mm2 \n\t"
2043 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2044 "psrlw $2, %%mm4 \n\t"
2045 "psrlw $2, %%mm2 \n\t"
2046#endif
2047 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2048 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2049
2050 "pmaddwd %%mm4, %%mm1 \n\t"
2051 "pmaddwd %%mm2, %%mm3 \n\t"
2052 "pmaddwd %%mm6, %%mm4 \n\t"
2053 "pmaddwd %%mm6, %%mm2 \n\t"
2054#ifndef FAST_BGR2YV12
2055 "psrad $8, %%mm4 \n\t"
2056 "psrad $8, %%mm1 \n\t"
2057 "psrad $8, %%mm2 \n\t"
2058 "psrad $8, %%mm3 \n\t"
2059#endif
2060 "packssdw %%mm2, %%mm4 \n\t"
2061 "packssdw %%mm3, %%mm1 \n\t"
2062 "pmaddwd %%mm5, %%mm4 \n\t"
2063 "pmaddwd %%mm5, %%mm1 \n\t"
2064 "add $24, %%"REG_d" \n\t"
2065 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2066 "psraw $7, %%mm4 \n\t"
2067
2068 "movq %%mm0, %%mm1 \n\t"
2069 "punpckldq %%mm4, %%mm0 \n\t"
2070 "punpckhdq %%mm4, %%mm1 \n\t"
2071 "packsswb %%mm1, %%mm0 \n\t"
2072 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
2073
2074 "movd %%mm0, (%1, %%"REG_a") \n\t"
2075 "punpckhdq %%mm0, %%mm0 \n\t"
2076 "movd %%mm0, (%2, %%"REG_a") \n\t"
2077 "add $4, %%"REG_a" \n\t"
2078 " js 1b \n\t"
2079 : : "r" (src1+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
2080 : "%"REG_a, "%"REG_d
2081 );
2082#else
2083 int i;
2084 for (i=0; i<width; i++)
2085 {
2086 int b= src1[6*i + 0] + src1[6*i + 3];
2087 int g= src1[6*i + 1] + src1[6*i + 4];
2088 int r= src1[6*i + 2] + src1[6*i + 5];
2089
2090 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2091 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2092 }
2093#endif /* HAVE_MMX */
2094 assert(src1 == src2);
2095}
2096
2097static inline void RENAME(rgb16ToY)(uint8_t *dst, uint8_t *src, int width)
2098{
2099 int i;
2100 for (i=0; i<width; i++)
2101 {
2102 int d= ((uint16_t*)src)[i];
2103 int b= d&0x1F;
2104 int g= (d>>5)&0x3F;
2105 int r= (d>>11)&0x1F;
2106
2107 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2108 }
2109}
2110
2111static inline void RENAME(rgb16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2112{
2113 int i;
2114 assert(src1==src2);
2115 for (i=0; i<width; i++)
2116 {
2117 int d0= ((uint32_t*)src1)[i];
2118
2119 int dl= (d0&0x07E0F81F);
2120 int dh= ((d0>>5)&0x07C0F83F);
2121
2122 int dh2= (dh>>11) + (dh<<21);
2123 int d= dh2 + dl;
2124
2125 int b= d&0x7F;
2126 int r= (d>>11)&0x7F;
2127 int g= d>>21;
2128 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2129 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2130 }
2131}
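
/* Same SWAR idea at 16bpp: d0 holds two RGB565 pixels, the two masks spread
 * the channels into non-adjacent bit fields so both pixels can be added
 * without carries, and the (dh>>11)+(dh<<21) rotate lines the second pixel's
 * fields up with the first. The b, r and g extracted above are thus two-pixel
 * sums, compensated by the extra +1 in the shift; rgb15ToUV and the bgr15/16
 * variants below follow the same scheme with adjusted masks. */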
2132
2133static inline void RENAME(rgb15ToY)(uint8_t *dst, uint8_t *src, int width)
2134{
2135 int i;
2136 for (i=0; i<width; i++)
2137 {
2138 int d= ((uint16_t*)src)[i];
2139 int b= d&0x1F;
2140 int g= (d>>5)&0x1F;
2141 int r= (d>>10)&0x1F;
2142
2143 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2144 }
2145}
2146
2147static inline void RENAME(rgb15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2148{
2149 int i;
2150 assert(src1==src2);
2151 for (i=0; i<width; i++)
2152 {
2153 int d0= ((uint32_t*)src1)[i];
2154
2155 int dl= (d0&0x03E07C1F);
2156 int dh= ((d0>>5)&0x03E0F81F);
2157
2158 int dh2= (dh>>11) + (dh<<21);
2159 int d= dh2 + dl;
2160
2161 int b= d&0x7F;
2162 int r= (d>>10)&0x7F;
2163 int g= d>>21;
2164 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2165 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2166 }
2167}
2168
2169
2170static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
2171{
2172 int i;
2173 for (i=0; i<width; i++)
2174 {
2175 int r= ((uint32_t*)src)[i]&0xFF;
2176 int g= (((uint32_t*)src)[i]>>8)&0xFF;
2177 int b= (((uint32_t*)src)[i]>>16)&0xFF;
2178
2179 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2180 }
2181}
2182
2183static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2184{
2185 int i;
2186 assert(src1==src2);
2187 for (i=0; i<width; i++)
2188 {
2189 const int a= ((uint32_t*)src1)[2*i+0];
2190 const int e= ((uint32_t*)src1)[2*i+1];
2191 const int l= (a&0xFF00FF) + (e&0xFF00FF);
2192 const int h= (a&0x00FF00) + (e&0x00FF00);
2193 const int r= l&0x3FF;
2194 const int g= h>>8;
2195 const int b= l>>16;
2196
2197 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2198 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2199 }
2200}
2201
2202static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2203{
2204 int i;
2205 for (i=0; i<width; i++)
2206 {
2207 int r= src[i*3+0];
2208 int g= src[i*3+1];
2209 int b= src[i*3+2];
2210
2211 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2212 }
2213}
2214
2215static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2216{
2217 int i;
2218 assert(src1==src2);
2219 for (i=0; i<width; i++)
2220 {
2221 int r= src1[6*i + 0] + src1[6*i + 3];
2222 int g= src1[6*i + 1] + src1[6*i + 4];
2223 int b= src1[6*i + 2] + src1[6*i + 5];
2224
2225 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2226 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2227 }
2228}
2229
2230static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
2231{
2232 int i;
2233 for (i=0; i<width; i++)
2234 {
2235 int d= ((uint16_t*)src)[i];
2236 int r= d&0x1F;
2237 int g= (d>>5)&0x3F;
2238 int b= (d>>11)&0x1F;
2239
2240 dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2241 }
2242}
2243
2244static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2245{
2246 int i;
2247 assert(src1 == src2);
2248 for (i=0; i<width; i++)
2249 {
2250 int d0= ((uint32_t*)src1)[i];
2251
2252 int dl= (d0&0x07E0F81F);
2253 int d= dl + (((d0>>16) + (d0<<16))&0x07E0F81F);
2254
2255 int r= d&0x3F;
2256 int b= (d>>11)&0x3F;
2257 int g= d>>21;
2258 dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2259 dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2260 }
2261}
2262
2263static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
2264{
2265 int i;
2266 for (i=0; i<width; i++)
2267 {
2268 int d= ((uint16_t*)src)[i];
2269 int r= d&0x1F;
2270 int g= (d>>5)&0x1F;
2271 int b= (d>>10)&0x1F;
2272
2273 dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2274 }
2275}
2276
2277static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2278{
2279 int i;
2280 assert(src1 == src2);
2281 for (i=0; i<width; i++)
2282 {
2283 int d0= ((uint32_t*)src1)[i];
2284
2285 int dl= (d0&0x03E07C1F);
2286 int d= dl + (((d0>>16) + (d0<<16))&0x03E07C1F);
2287
2288 int r= d&0x3F;
2289 int b= (d>>10)&0x3F;
2290 int g= d>>21;
2291 dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2292 dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2293 }
2294}
2295
2296static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, int width, uint32_t *pal)
2297{
2298 int i;
2299 for (i=0; i<width; i++)
2300 {
2301 int d= src[i];
2302
2303 dst[i]= pal[d] & 0xFF;
2304 }
2305}
2306
2307static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width, uint32_t *pal)
2308{
2309 int i;
2310 assert(src1 == src2);
2311 for (i=0; i<width; i++)
2312 {
2313 int p= pal[src1[i]];
2314
2315 dstU[i]= p>>8;
2316 dstV[i]= p>>16;
2317 }
2318}
2319
2320// bilinear / bicubic scaling
2321static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2322 int16_t *filter, int16_t *filterPos, long filterSize)
2323{
2324#ifdef HAVE_MMX
2325 assert(filterSize % 4 == 0 && filterSize>0);
2326 if (filterSize==4) // Always true for upscaling, sometimes for down, too.
2327 {
2328 long counter= -2*dstW;
2329 filter-= counter*2;
2330 filterPos-= counter/2;
2331 dst-= counter/2;
2332 asm volatile(
2333#if defined(PIC)
2334 "push %%"REG_b" \n\t"
2335#endif
2336 "pxor %%mm7, %%mm7 \n\t"
2337 "movq "MANGLE(w02)", %%mm6 \n\t"
2338 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2339 "mov %%"REG_a", %%"REG_BP" \n\t"
2340 ASMALIGN(4)
2341 "1: \n\t"
2342 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2343 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2344 "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
2345 "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
2346 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2347 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2348 "punpcklbw %%mm7, %%mm0 \n\t"
2349 "punpcklbw %%mm7, %%mm2 \n\t"
2350 "pmaddwd %%mm1, %%mm0 \n\t"
2351 "pmaddwd %%mm2, %%mm3 \n\t"
2352 "psrad $8, %%mm0 \n\t"
2353 "psrad $8, %%mm3 \n\t"
2354 "packssdw %%mm3, %%mm0 \n\t"
2355 "pmaddwd %%mm6, %%mm0 \n\t"
2356 "packssdw %%mm0, %%mm0 \n\t"
2357 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2358 "add $4, %%"REG_BP" \n\t"
2359 " jnc 1b \n\t"
2360
2361 "pop %%"REG_BP" \n\t"
2362#if defined(PIC)
2363 "pop %%"REG_b" \n\t"
2364#endif
2365 : "+a" (counter)
2366 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2367#if !defined(PIC)
2368 : "%"REG_b
2369#endif
2370 );
2371 }
2372 else if (filterSize==8)
2373 {
2374 long counter= -2*dstW;
2375 filter-= counter*4;
2376 filterPos-= counter/2;
2377 dst-= counter/2;
2378 asm volatile(
2379#if defined(PIC)
2380 "push %%"REG_b" \n\t"
2381#endif
2382 "pxor %%mm7, %%mm7 \n\t"
2383 "movq "MANGLE(w02)", %%mm6 \n\t"
2384 "push %%"REG_BP" \n\t" // we use 7 regs here ...
2385 "mov %%"REG_a", %%"REG_BP" \n\t"
2386 ASMALIGN(4)
2387 "1: \n\t"
2388 "movzwl (%2, %%"REG_BP"), %%eax \n\t"
2389 "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
2390 "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
2391 "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
2392 "movd (%3, %%"REG_a"), %%mm0 \n\t"
2393 "movd (%3, %%"REG_b"), %%mm2 \n\t"
2394 "punpcklbw %%mm7, %%mm0 \n\t"
2395 "punpcklbw %%mm7, %%mm2 \n\t"
2396 "pmaddwd %%mm1, %%mm0 \n\t"
2397 "pmaddwd %%mm2, %%mm3 \n\t"
2398
2399 "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
2400 "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
2401 "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
2402 "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
2403 "punpcklbw %%mm7, %%mm4 \n\t"
2404 "punpcklbw %%mm7, %%mm2 \n\t"
2405 "pmaddwd %%mm1, %%mm4 \n\t"
2406 "pmaddwd %%mm2, %%mm5 \n\t"
2407 "paddd %%mm4, %%mm0 \n\t"
2408 "paddd %%mm5, %%mm3 \n\t"
2409
2410 "psrad $8, %%mm0 \n\t"
2411 "psrad $8, %%mm3 \n\t"
2412 "packssdw %%mm3, %%mm0 \n\t"
2413 "pmaddwd %%mm6, %%mm0 \n\t"
2414 "packssdw %%mm0, %%mm0 \n\t"
2415 "movd %%mm0, (%4, %%"REG_BP") \n\t"
2416 "add $4, %%"REG_BP" \n\t"
2417 " jnc 1b \n\t"
2418
2419 "pop %%"REG_BP" \n\t"
2420#if defined(PIC)
2421 "pop %%"REG_b" \n\t"
2422#endif
2423 : "+a" (counter)
2424 : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2425#if !defined(PIC)
2426 : "%"REG_b
2427#endif
2428 );
2429 }
2430 else
2431 {
2432 uint8_t *offset = src+filterSize;
2433 long counter= -2*dstW;
2434 //filter-= counter*filterSize/2;
2435 filterPos-= counter/2;
2436 dst-= counter/2;
2437 asm volatile(
2438 "pxor %%mm7, %%mm7 \n\t"
2439 "movq "MANGLE(w02)", %%mm6 \n\t"
2440 ASMALIGN(4)
2441 "1: \n\t"
2442 "mov %2, %%"REG_c" \n\t"
2443 "movzwl (%%"REG_c", %0), %%eax \n\t"
2444 "movzwl 2(%%"REG_c", %0), %%edx \n\t"
2445 "mov %5, %%"REG_c" \n\t"
2446 "pxor %%mm4, %%mm4 \n\t"
2447 "pxor %%mm5, %%mm5 \n\t"
2448 "2: \n\t"
2449 "movq (%1), %%mm1 \n\t"
2450 "movq (%1, %6), %%mm3 \n\t"
2451 "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
2452 "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
2453 "punpcklbw %%mm7, %%mm0 \n\t"
2454 "punpcklbw %%mm7, %%mm2 \n\t"
2455 "pmaddwd %%mm1, %%mm0 \n\t"
2456 "pmaddwd %%mm2, %%mm3 \n\t"
2457 "paddd %%mm3, %%mm5 \n\t"
2458 "paddd %%mm0, %%mm4 \n\t"
2459 "add $8, %1 \n\t"
2460 "add $4, %%"REG_c" \n\t"
2461 "cmp %4, %%"REG_c" \n\t"
2462 " jb 2b \n\t"
2463 "add %6, %1 \n\t"
2464 "psrad $8, %%mm4 \n\t"
2465 "psrad $8, %%mm5 \n\t"
2466 "packssdw %%mm5, %%mm4 \n\t"
2467 "pmaddwd %%mm6, %%mm4 \n\t"
2468 "packssdw %%mm4, %%mm4 \n\t"
2469 "mov %3, %%"REG_a" \n\t"
2470 "movd %%mm4, (%%"REG_a", %0) \n\t"
2471 "add $4, %0 \n\t"
2472 " jnc 1b \n\t"
2473
2474 : "+r" (counter), "+r" (filter)
2475 : "m" (filterPos), "m" (dst), "m"(offset),
2476 "m" (src), "r" (filterSize*2)
2477 : "%"REG_a, "%"REG_c, "%"REG_d
2478 );
2479 }
2480#else
2481#ifdef HAVE_ALTIVEC
2482 hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2483#else
2484 int i;
2485 for (i=0; i<dstW; i++)
2486 {
2487 int j;
2488 int srcPos= filterPos[i];
2489 int val=0;
2490 //printf("filterPos: %d\n", filterPos[i]);
2491 for (j=0; j<filterSize; j++)
2492 {
2493 //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2494 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2495 }
2496 //filter += hFilterSize;
2497 dst[i] = av_clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ...
2498 //dst[i] = val>>7;
2499 }
2500#endif /* HAVE_ALTIVEC */
2501#endif /* HAVE_MMX */
2502}
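All three MMX variants above share one loop idiom: the counter starts at -2*dstW, the data pointers are pre-biased by that counter, and the counter climbs toward zero so that "add $4; jnc 1b" needs no separate compare (the carry flag of the add terminates the loop). A hedged C analogue of the same idiom:

#include <stdio.h>

/* C rendering of the negative-counter loop used in the MMX paths: bias the
 * destination by the count, then index with a counter that climbs to zero */
static void fill(short *dst, long n)
{
    long counter = -n;
    dst -= counter;                    /* dst now points one past the end */
    do {
        dst[counter] = (short)counter; /* real code stores a filtered pixel */
        counter++;
    } while (counter != 0);            /* the asm gets this test for free (jnc) */
}

int main(void)
{
    short buf[4];
    fill(buf, 4);
    printf("%d %d\n", buf[0], buf[3]); /* -4 -1 */
    return 0;
}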
2503 // *** horizontal scale Y line to temp buffer
2504static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2505 int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2506 int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2507 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2508 int32_t *mmx2FilterPos, uint8_t *pal)
2509{
2510 if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
2511 {
2512 RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2513 src= formatConvBuffer;
2514 }
2515 else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
2516 {
2517 RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2518 src= formatConvBuffer;
2519 }
2520 else if (srcFormat==PIX_FMT_RGB32)
2521 {
2522 RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2523 src= formatConvBuffer;
2524 }
2525 else if (srcFormat==PIX_FMT_BGR24)
2526 {
2527 RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2528 src= formatConvBuffer;
2529 }
2530 else if (srcFormat==PIX_FMT_BGR565)
2531 {
2532 RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2533 src= formatConvBuffer;
2534 }
2535 else if (srcFormat==PIX_FMT_BGR555)
2536 {
2537 RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2538 src= formatConvBuffer;
2539 }
2540 else if (srcFormat==PIX_FMT_BGR32)
2541 {
2542 RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2543 src= formatConvBuffer;
2544 }
2545 else if (srcFormat==PIX_FMT_RGB24)
2546 {
2547 RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2548 src= formatConvBuffer;
2549 }
2550 else if (srcFormat==PIX_FMT_RGB565)
2551 {
2552 RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
2553 src= formatConvBuffer;
2554 }
2555 else if (srcFormat==PIX_FMT_RGB555)
2556 {
2557 RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
2558 src= formatConvBuffer;
2559 }
2560 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2561 {
2562 RENAME(palToY)(formatConvBuffer, src, srcW, (uint32_t*)pal);
2563 src= formatConvBuffer;
2564 }
2565
2566#ifdef HAVE_MMX
2567 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2568 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2569#else
2570 if (!(flags&SWS_FAST_BILINEAR))
2571#endif
2572 {
2573 RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2574 }
2575 else // fast bilinear upscale / crap downscale
2576 {
2577#if defined(ARCH_X86)
2578#ifdef HAVE_MMX2
2579 int i;
2580#if defined(PIC)
2581 uint64_t ebxsave __attribute__((aligned(8)));
2582#endif
2583 if (canMMX2BeUsed)
2584 {
2585 asm volatile(
2586#if defined(PIC)
2587 "mov %%"REG_b", %5 \n\t"
2588#endif
2589 "pxor %%mm7, %%mm7 \n\t"
2590 "mov %0, %%"REG_c" \n\t"
2591 "mov %1, %%"REG_D" \n\t"
2592 "mov %2, %%"REG_d" \n\t"
2593 "mov %3, %%"REG_b" \n\t"
2594 "xor %%"REG_a", %%"REG_a" \n\t" // i
2595 PREFETCH" (%%"REG_c") \n\t"
2596 PREFETCH" 32(%%"REG_c") \n\t"
2597 PREFETCH" 64(%%"REG_c") \n\t"
2598
2599#ifdef ARCH_X86_64
2600
2601#define FUNNY_Y_CODE \
2602 "movl (%%"REG_b"), %%esi \n\t"\
2603 "call *%4 \n\t"\
2604 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2605 "add %%"REG_S", %%"REG_c" \n\t"\
2606 "add %%"REG_a", %%"REG_D" \n\t"\
2607 "xor %%"REG_a", %%"REG_a" \n\t"\
2608
2609#else
2610
2611#define FUNNY_Y_CODE \
2612 "movl (%%"REG_b"), %%esi \n\t"\
2613 "call *%4 \n\t"\
2614 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2615 "add %%"REG_a", %%"REG_D" \n\t"\
2616 "xor %%"REG_a", %%"REG_a" \n\t"\
2617
2618#endif /* ARCH_X86_64 */
2619
2620FUNNY_Y_CODE
2621FUNNY_Y_CODE
2622FUNNY_Y_CODE
2623FUNNY_Y_CODE
2624FUNNY_Y_CODE
2625FUNNY_Y_CODE
2626FUNNY_Y_CODE
2627FUNNY_Y_CODE
2628
2629#if defined(PIC)
2630 "mov %5, %%"REG_b" \n\t"
2631#endif
2632 :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2633 "m" (funnyYCode)
2634#if defined(PIC)
2635 ,"m" (ebxsave)
2636#endif
2637 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2638#if !defined(PIC)
2639 ,"%"REG_b
2640#endif
2641 );
2642 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2643 }
2644 else
2645 {
2646#endif /* HAVE_MMX2 */
2647 long xInc_shr16 = xInc >> 16;
2648 uint16_t xInc_mask = xInc & 0xffff;
2649 //NO MMX just normal asm ...
2650 asm volatile(
2651 "xor %%"REG_a", %%"REG_a" \n\t" // i
2652 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2653 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2654 ASMALIGN(4)
2655 "1: \n\t"
2656 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2657 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2658 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2659 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2660 "shll $16, %%edi \n\t"
2661 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2662 "mov %1, %%"REG_D" \n\t"
2663 "shrl $9, %%esi \n\t"
2664 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2665 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2666 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2667
2668 "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx]
2669 "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1]
2670 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2671 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2672 "shll $16, %%edi \n\t"
2673 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2674 "mov %1, %%"REG_D" \n\t"
2675 "shrl $9, %%esi \n\t"
2676 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
2677 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2678 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2679
2680
2681 "add $2, %%"REG_a" \n\t"
2682 "cmp %2, %%"REG_a" \n\t"
2683 " jb 1b \n\t"
2684
2685
2686 :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2687 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2688 );
2689#ifdef HAVE_MMX2
2690 } //if MMX2 can't be used
2691#endif
2692#else
2693 int i;
2694 unsigned int xpos=0;
2695 for (i=0;i<dstWidth;i++)
2696 {
2697 register unsigned int xx=xpos>>16;
2698 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2699 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2700 xpos+=xInc;
2701 }
2702#endif /* defined(ARCH_X86) */
2703 }
2704}
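The fast-bilinear paths (both the asm and the C fallback above) walk the source with a 16.16 fixed-point position: the high 16 bits select the source pixel and the top 7 bits of the fraction become the blend weight, producing 15-bit intermediate luma. A self-contained sketch; the padding pixel stands in for the tail fix-up the real code performs after the asm, and the xInc computation here is simplistic:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint8_t  src[5] = { 0, 64, 128, 255, 255 }; /* padded so xx+1 stays in bounds */
    uint16_t dst[8];
    int srcW = 4, dstW = 8, i;
    unsigned xInc = ((unsigned)srcW << 16) / dstW; /* assumption: no phase handling */
    unsigned xpos = 0;
    for (i = 0; i < dstW; i++) {
        int xx     = xpos >> 16;           /* integer source position */
        int xalpha = (xpos & 0xFFFF) >> 9; /* 7-bit blend weight */
        dst[i] = (uint16_t)((src[xx] << 7) + (src[xx + 1] - src[xx]) * xalpha);
        xpos  += xInc;
    }
    printf("dst[0]=%u dst[1]=%u dst[7]=%u\n", dst[0], dst[1], dst[7]);
    return 0;
}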
2705
2706inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2707 int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2708 int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2709 int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2710 int32_t *mmx2FilterPos, uint8_t *pal)
2711{
2712 if (srcFormat==PIX_FMT_YUYV422)
2713 {
2714 RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2715 src1= formatConvBuffer;
2716 src2= formatConvBuffer+VOFW;
2717 }
2718 else if (srcFormat==PIX_FMT_UYVY422)
2719 {
2720 RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2721 src1= formatConvBuffer;
2722 src2= formatConvBuffer+VOFW;
2723 }
2724 else if (srcFormat==PIX_FMT_RGB32)
2725 {
2726 RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2727 src1= formatConvBuffer;
2728 src2= formatConvBuffer+VOFW;
2729 }
2730 else if (srcFormat==PIX_FMT_BGR24)
2731 {
2732 RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2733 src1= formatConvBuffer;
2734 src2= formatConvBuffer+VOFW;
2735 }
2736 else if (srcFormat==PIX_FMT_BGR565)
2737 {
2738 RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2739 src1= formatConvBuffer;
2740 src2= formatConvBuffer+VOFW;
2741 }
2742 else if (srcFormat==PIX_FMT_BGR555)
2743 {
2744 RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2745 src1= formatConvBuffer;
2746 src2= formatConvBuffer+VOFW;
2747 }
2748 else if (srcFormat==PIX_FMT_BGR32)
2749 {
2750 RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2751 src1= formatConvBuffer;
2752 src2= formatConvBuffer+VOFW;
2753 }
2754 else if (srcFormat==PIX_FMT_RGB24)
2755 {
2756 RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2757 src1= formatConvBuffer;
2758 src2= formatConvBuffer+VOFW;
2759 }
2760 else if (srcFormat==PIX_FMT_RGB565)
2761 {
2762 RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2763 src1= formatConvBuffer;
2764 src2= formatConvBuffer+VOFW;
2765 }
2766 else if (srcFormat==PIX_FMT_RGB555)
2767 {
2768 RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2769 src1= formatConvBuffer;
2770 src2= formatConvBuffer+VOFW;
2771 }
2772 else if (isGray(srcFormat))
2773 {
2774 return;
2775 }
2776 else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE)
2777 {
2778 RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, (uint32_t*)pal);
2779 src1= formatConvBuffer;
2780 src2= formatConvBuffer+VOFW;
2781 }
2782
2783#ifdef HAVE_MMX
2784 // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2785 if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2786#else
2787 if (!(flags&SWS_FAST_BILINEAR))
2788#endif
2789 {
2790 RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2791 RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2792 }
2793 else // fast bilinear upscale / crap downscale
2794 {
2795#if defined(ARCH_X86)
2796#ifdef HAVE_MMX2
2797 int i;
2798#if defined(PIC)
2799 uint64_t ebxsave __attribute__((aligned(8)));
2800#endif
2801 if (canMMX2BeUsed)
2802 {
2803 asm volatile(
2804#if defined(PIC)
2805 "mov %%"REG_b", %6 \n\t"
2806#endif
2807 "pxor %%mm7, %%mm7 \n\t"
2808 "mov %0, %%"REG_c" \n\t"
2809 "mov %1, %%"REG_D" \n\t"
2810 "mov %2, %%"REG_d" \n\t"
2811 "mov %3, %%"REG_b" \n\t"
2812 "xor %%"REG_a", %%"REG_a" \n\t" // i
2813 PREFETCH" (%%"REG_c") \n\t"
2814 PREFETCH" 32(%%"REG_c") \n\t"
2815 PREFETCH" 64(%%"REG_c") \n\t"
2816
2817#ifdef ARCH_X86_64
2818
2819#define FUNNY_UV_CODE \
2820 "movl (%%"REG_b"), %%esi \n\t"\
2821 "call *%4 \n\t"\
2822 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2823 "add %%"REG_S", %%"REG_c" \n\t"\
2824 "add %%"REG_a", %%"REG_D" \n\t"\
2825 "xor %%"REG_a", %%"REG_a" \n\t"\
2826
2827#else
2828
2829#define FUNNY_UV_CODE \
2830 "movl (%%"REG_b"), %%esi \n\t"\
2831 "call *%4 \n\t"\
2832 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2833 "add %%"REG_a", %%"REG_D" \n\t"\
2834 "xor %%"REG_a", %%"REG_a" \n\t"\
2835
2836#endif /* ARCH_X86_64 */
2837
2838FUNNY_UV_CODE
2839FUNNY_UV_CODE
2840FUNNY_UV_CODE
2841FUNNY_UV_CODE
2842 "xor %%"REG_a", %%"REG_a" \n\t" // i
2843 "mov %5, %%"REG_c" \n\t" // src
2844 "mov %1, %%"REG_D" \n\t" // buf1
2845 "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
2846 PREFETCH" (%%"REG_c") \n\t"
2847 PREFETCH" 32(%%"REG_c") \n\t"
2848 PREFETCH" 64(%%"REG_c") \n\t"
2849
2850FUNNY_UV_CODE
2851FUNNY_UV_CODE
2852FUNNY_UV_CODE
2853FUNNY_UV_CODE
2854
2855#if defined(PIC)
2856 "mov %6, %%"REG_b" \n\t"
2857#endif
2858 :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2859 "m" (funnyUVCode), "m" (src2)
2860#if defined(PIC)
2861 ,"m" (ebxsave)
2862#endif
2863 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2864#if !defined(PIC)
2865 ,"%"REG_b
2866#endif
2867 );
2868 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2869 {
2870 //printf("%d %d %d\n", dstWidth, i, srcW);
2871 dst[i] = src1[srcW-1]*128;
2872 dst[i+VOFW] = src2[srcW-1]*128;
2873 }
2874 }
2875 else
2876 {
2877#endif /* HAVE_MMX2 */
2878 long xInc_shr16 = (long) (xInc >> 16);
2879 uint16_t xInc_mask = xInc & 0xffff;
2880 asm volatile(
2881 "xor %%"REG_a", %%"REG_a" \n\t" // i
2882 "xor %%"REG_d", %%"REG_d" \n\t" // xx
2883 "xorl %%ecx, %%ecx \n\t" // 2*xalpha
2884 ASMALIGN(4)
2885 "1: \n\t"
2886 "mov %0, %%"REG_S" \n\t"
2887 "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx]
2888 "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1]
2889 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2890 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2891 "shll $16, %%edi \n\t"
2892 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2893 "mov %1, %%"REG_D" \n\t"
2894 "shrl $9, %%esi \n\t"
2895 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
2896
2897 "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx]
2898 "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1]
2899 "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
2900 "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
2901 "shll $16, %%edi \n\t"
2902 "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2903 "mov %1, %%"REG_D" \n\t"
2904 "shrl $9, %%esi \n\t"
2905 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"
2906
2907 "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
2908 "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry
2909 "add $1, %%"REG_a" \n\t"
2910 "cmp %2, %%"REG_a" \n\t"
2911 " jb 1b \n\t"
2912
2913/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2914 which is needed to support GCC 4.0. */
2915#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2916 :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2917#else
2918 :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2919#endif
2920 "r" (src2)
2921 : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2922 );
2923#ifdef HAVE_MMX2
2924 } //if MMX2 can't be used
2925#endif
2926#else
2927 int i;
2928 unsigned int xpos=0;
2929 for (i=0;i<dstWidth;i++)
2930 {
2931 register unsigned int xx=xpos>>16;
2932 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2933 dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2934 dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2935 /* slower
2936 dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2937 dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2938 */
2939 xpos+=xInc;
2940 }
2941#endif /* defined(ARCH_X86) */
2942 }
2943}
2944
2945static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2946 int srcSliceH, uint8_t* dst[], int dstStride[]){
2947
2948    /* load a few things into local vars to make the code more readable and faster */
2949 const int srcW= c->srcW;
2950 const int dstW= c->dstW;
2951 const int dstH= c->dstH;
2952 const int chrDstW= c->chrDstW;
2953 const int chrSrcW= c->chrSrcW;
2954 const int lumXInc= c->lumXInc;
2955 const int chrXInc= c->chrXInc;
2956 const int dstFormat= c->dstFormat;
2957 const int srcFormat= c->srcFormat;
2958 const int flags= c->flags;
2959 const int canMMX2BeUsed= c->canMMX2BeUsed;
2960 int16_t *vLumFilterPos= c->vLumFilterPos;
2961 int16_t *vChrFilterPos= c->vChrFilterPos;
2962 int16_t *hLumFilterPos= c->hLumFilterPos;
2963 int16_t *hChrFilterPos= c->hChrFilterPos;
2964 int16_t *vLumFilter= c->vLumFilter;
2965 int16_t *vChrFilter= c->vChrFilter;
2966 int16_t *hLumFilter= c->hLumFilter;
2967 int16_t *hChrFilter= c->hChrFilter;
2968 int32_t *lumMmxFilter= c->lumMmxFilter;
2969 int32_t *chrMmxFilter= c->chrMmxFilter;
2970 const int vLumFilterSize= c->vLumFilterSize;
2971 const int vChrFilterSize= c->vChrFilterSize;
2972 const int hLumFilterSize= c->hLumFilterSize;
2973 const int hChrFilterSize= c->hChrFilterSize;
2974 int16_t **lumPixBuf= c->lumPixBuf;
2975 int16_t **chrPixBuf= c->chrPixBuf;
2976 const int vLumBufSize= c->vLumBufSize;
2977 const int vChrBufSize= c->vChrBufSize;
2978 uint8_t *funnyYCode= c->funnyYCode;
2979 uint8_t *funnyUVCode= c->funnyUVCode;
2980 uint8_t *formatConvBuffer= c->formatConvBuffer;
2981 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2982    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); // rounds up: ceil(srcSliceH / (1<<chrSrcVSubSample))
2983 int lastDstY;
2984 uint8_t *pal=NULL;
2985
2986 /* vars which will change and which we need to store back in the context */
2987 int dstY= c->dstY;
2988 int lumBufIndex= c->lumBufIndex;
2989 int chrBufIndex= c->chrBufIndex;
2990 int lastInLumBuf= c->lastInLumBuf;
2991 int lastInChrBuf= c->lastInChrBuf;
2992
2993 if (isPacked(c->srcFormat)){
2994 pal= src[1];
2995 src[0]=
2996 src[1]=
2997 src[2]= src[0];
2998 srcStride[0]=
2999 srcStride[1]=
3000 srcStride[2]= srcStride[0];
3001 }
3002 srcStride[1]<<= c->vChrDrop;
3003 srcStride[2]<<= c->vChrDrop;
3004
3005 //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
3006 // (int)dst[0], (int)dst[1], (int)dst[2]);
3007
3008#if 0 //self test FIXME move to a vfilter or something
3009 {
3010 static volatile int i=0;
3011 i++;
3012 if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
3013 selfTest(src, srcStride, c->srcW, c->srcH);
3014 i--;
3015 }
3016#endif
3017
3018 //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
3019 //dstStride[0],dstStride[1],dstStride[2]);
3020
3021 if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
3022 {
3023 static int firstTime=1; //FIXME move this into the context perhaps
3024 if (flags & SWS_PRINT_INFO && firstTime)
3025 {
3026 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
3027 " ->cannot do aligned memory accesses anymore\n");
3028 firstTime=0;
3029 }
3030 }
3031
3032    /* Note: the user might start scaling the picture in the middle, so this
3033       will not get executed. This is not really intended but currently works,
3034       so people might rely on it. */
3035 if (srcSliceY ==0){
3036 lumBufIndex=0;
3037 chrBufIndex=0;
3038 dstY=0;
3039 lastInLumBuf= -1;
3040 lastInChrBuf= -1;
3041 }
3042
3043 lastDstY= dstY;
3044
3045 for (;dstY < dstH; dstY++){
3046 unsigned char *dest =dst[0]+dstStride[0]*dstY;
3047 const int chrDstY= dstY>>c->chrDstVSubSample;
3048 unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
3049 unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
3050
3051 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
3052 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
3053 const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
3054 const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
3055
3056 //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
3057 // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
3058 //handle holes (FAST_BILINEAR & weird filters)
3059 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
3060 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
3061 //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
3062 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
3063 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
3064
3065        // Do we have enough lines in this slice to output the dstY line?
3066 if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
3067 {
3068 //Do horizontal scaling
3069 while(lastInLumBuf < lastLumSrcY)
3070 {
3071 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3072 lumBufIndex++;
3073 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
3074 assert(lumBufIndex < 2*vLumBufSize);
3075 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3076 assert(lastInLumBuf + 1 - srcSliceY >= 0);
3077 //printf("%d %d\n", lumBufIndex, vLumBufSize);
3078 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3079 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3080 funnyYCode, c->srcFormat, formatConvBuffer,
3081 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3082 lastInLumBuf++;
3083 }
3084 while(lastInChrBuf < lastChrSrcY)
3085 {
3086 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3087 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3088 chrBufIndex++;
3089 assert(chrBufIndex < 2*vChrBufSize);
3090 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
3091 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
3092 //FIXME replace parameters through context struct (some at least)
3093
3094 if (!(isGray(srcFormat) || isGray(dstFormat)))
3095 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3096 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3097 funnyUVCode, c->srcFormat, formatConvBuffer,
3098 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3099 lastInChrBuf++;
3100 }
3101 //wrap buf index around to stay inside the ring buffer
3102 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3103 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3104 }
3105 else // not enough lines left in this slice -> load the rest in the buffer
3106 {
3107 /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
3108 firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
3109 lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
3110 vChrBufSize, vLumBufSize);*/
3111
3112 //Do horizontal scaling
3113 while(lastInLumBuf+1 < srcSliceY + srcSliceH)
3114 {
3115 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3116 lumBufIndex++;
3117 assert(lumBufIndex < 2*vLumBufSize);
3118 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3119 assert(lastInLumBuf + 1 - srcSliceY >= 0);
3120 RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3121 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3122 funnyYCode, c->srcFormat, formatConvBuffer,
3123 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3124 lastInLumBuf++;
3125 }
3126 while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3127 {
3128 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3129 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3130 chrBufIndex++;
3131 assert(chrBufIndex < 2*vChrBufSize);
3132 assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
3133 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
3134
3135 if (!(isGray(srcFormat) || isGray(dstFormat)))
3136 RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3137 flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3138 funnyUVCode, c->srcFormat, formatConvBuffer,
3139 c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3140 lastInChrBuf++;
3141 }
3142 //wrap buf index around to stay inside the ring buffer
3143 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3144 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3145 break; //we can't output a dstY line so let's try with the next slice
3146 }
3147
3148#ifdef HAVE_MMX
3149 b5Dither= ff_dither8[dstY&1];
3150 g6Dither= ff_dither4[dstY&1];
3151 g5Dither= ff_dither8[dstY&1];
3152 r5Dither= ff_dither8[(dstY+1)&1];
3153#endif
3154 if (dstY < dstH-2)
3155 {
3156 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3157 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3158#ifdef HAVE_MMX
3159 int i;
3160 if (flags & SWS_ACCURATE_RND){
3161 for (i=0; i<vLumFilterSize; i+=2){
3162 lumMmxFilter[2*i+0]= (int32_t)lumSrcPtr[i ];
3163 lumMmxFilter[2*i+1]= (int32_t)lumSrcPtr[i+(vLumFilterSize>1)];
3164 lumMmxFilter[2*i+2]=
3165 lumMmxFilter[2*i+3]= vLumFilter[dstY*vLumFilterSize + i ]
3166 + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3167 }
3168 for (i=0; i<vChrFilterSize; i+=2){
3169 chrMmxFilter[2*i+0]= (int32_t)chrSrcPtr[i ];
3170 chrMmxFilter[2*i+1]= (int32_t)chrSrcPtr[i+(vChrFilterSize>1)];
3171 chrMmxFilter[2*i+2]=
3172 chrMmxFilter[2*i+3]= vChrFilter[chrDstY*vChrFilterSize + i ]
3173 + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
3174 }
3175 }else{
3176 for (i=0; i<vLumFilterSize; i++)
3177 {
3178 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3179 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
3180 lumMmxFilter[4*i+2]=
3181 lumMmxFilter[4*i+3]=
3182 ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
3183 }
3184 for (i=0; i<vChrFilterSize; i++)
3185 {
3186 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
3187 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
3188 chrMmxFilter[4*i+2]=
3189 chrMmxFilter[4*i+3]=
3190 ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
3191 }
3192 }
3193#endif
3194 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3195 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3196 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3197 RENAME(yuv2nv12X)(c,
3198 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3199 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3200 dest, uDest, dstW, chrDstW, dstFormat);
3201 }
3202 else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
3203 {
3204 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3205 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3206 if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
3207 {
3208 int16_t *lumBuf = lumPixBuf[0];
3209 int16_t *chrBuf= chrPixBuf[0];
3210 RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
3211 }
3212 else //General YV12
3213 {
3214 RENAME(yuv2yuvX)(c,
3215 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3216 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3217 dest, uDest, vDest, dstW, chrDstW);
3218 }
3219 }
3220 else
3221 {
3222 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3223 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3224 if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
3225 {
3226 int chrAlpha= vChrFilter[2*dstY+1];
3227 RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
3228 dest, dstW, chrAlpha, dstFormat, flags, dstY);
3229 }
3230 else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
3231 {
3232 int lumAlpha= vLumFilter[2*dstY+1];
3233 int chrAlpha= vChrFilter[2*dstY+1];
3234 lumMmxFilter[2]=
3235 lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
3236 chrMmxFilter[2]=
3237 chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
3238 RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
3239 dest, dstW, lumAlpha, chrAlpha, dstY);
3240 }
3241 else //general RGB
3242 {
3243 RENAME(yuv2packedX)(c,
3244 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3245 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3246 dest, dstW, dstY);
3247 }
3248 }
3249 }
3250 else // hmm looks like we can't use MMX here without overwriting this array's tail
3251 {
3252 int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3253 int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3254 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3255 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3256 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3257 yuv2nv12XinC(
3258 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3259 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3260 dest, uDest, dstW, chrDstW, dstFormat);
3261 }
3262 else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
3263 {
3264 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3265 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3266 yuv2yuvXinC(
3267 vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
3268 vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3269 dest, uDest, vDest, dstW, chrDstW);
3270 }
3271 else
3272 {
3273 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3274 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3275 yuv2packedXinC(c,
3276 vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3277 vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3278 dest, dstW, dstY);
3279 }
3280 }
3281 }
3282
3283#ifdef HAVE_MMX
3284 asm volatile(SFENCE:::"memory");
3285 asm volatile(EMMS:::"memory");
3286#endif
3287 /* store changed local vars back in the context */
3288 c->dstY= dstY;
3289 c->lumBufIndex= lumBufIndex;
3290 c->chrBufIndex= chrBufIndex;
3291 c->lastInLumBuf= lastInLumBuf;
3292 c->lastInChrBuf= lastInChrBuf;
3293
3294 return dstY - lastDstY;
3295}
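The lumSrcPtr/chrSrcPtr arithmetic in swScale() relies on the pixel-buffer pointer arrays being allocated at double length, with the second half mirroring the first (each scanline pointer stored twice, vLumBufSize apart). Under that assumption, which is about the setup code outside this template, adding vLumBufSize to the ring index always yields vLumFilterSize consecutive valid pointers without an explicit wrap. A small sketch of that layout:

#include <stdio.h>

#define BUF_SIZE 4

int main(void)
{
    int  lines[BUF_SIZE];       /* stand-ins for the scanline buffers */
    int *ring[2 * BUF_SIZE];
    int  i, j;
    for (i = 0; i < BUF_SIZE; i++)
        ring[i] = ring[i + BUF_SIZE] = &lines[i]; /* mirrored halves */

    /* same index expression as lumSrcPtr in swScale() */
    int bufIndex = 3, firstSrcY = 10, lastInBuf = 12, filterSize = 3;
    int **window = ring + bufIndex + firstSrcY - lastInBuf + BUF_SIZE;
    for (j = 0; j < filterSize; j++)
        printf("tap %d -> line slot %d\n", j, (int)(window[j] - lines));
    return 0;
}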
diff --git a/src/plugins/ffmpeg/libswscale/yuv2rgb.c b/src/plugins/ffmpeg/libswscale/yuv2rgb.c
deleted file mode 100644
index f0613a8..0000000
--- a/src/plugins/ffmpeg/libswscale/yuv2rgb.c
+++ /dev/null
@@ -1,887 +0,0 @@
1/*
2 * yuv2rgb.c, Software YUV to RGB converter
3 *
4 * Copyright (C) 1999, Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
5 *
6 * Functions broken out from display_x11.c and several new modes
7 * added by Håkan Hjort <d95hjort@dtek.chalmers.se>
8 *
9 * 15 & 16 bpp support by Franck Sicard <Franck.Sicard@solsoft.fr>
10 *
11 * MMX/MMX2 template stuff (needed for fast movntq support),
12 * 1,4,8bpp support and context / deglobalize stuff
13 * by Michael Niedermayer (michaelni@gmx.at)
14 *
15 * This file is part of mpeg2dec, a free MPEG-2 video decoder
16 *
17 * mpeg2dec is free software; you can redistribute it and/or modify
18 * it under the terms of the GNU General Public License as published by
19 * the Free Software Foundation; either version 2, or (at your option)
20 * any later version.
21 *
22 * mpeg2dec is distributed in the hope that it will be useful,
23 * but WITHOUT ANY WARRANTY; without even the implied warranty of
24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 * GNU General Public License for more details.
26 *
27 * You should have received a copy of the GNU General Public License
28 * along with mpeg2dec; if not, write to the Free Software
29 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 */
31
32#include <stdio.h>
33#include <stdlib.h>
34#include <inttypes.h>
35#include <assert.h>
36
37#include "config.h"
38#include "rgb2rgb.h"
39#include "swscale.h"
40#include "swscale_internal.h"
41
42#define DITHER1XBPP // only for MMX
43
44const uint8_t __attribute__((aligned(8))) dither_2x2_4[2][8]={
45{ 1, 3, 1, 3, 1, 3, 1, 3, },
46{ 2, 0, 2, 0, 2, 0, 2, 0, },
47};
48
49const uint8_t __attribute__((aligned(8))) dither_2x2_8[2][8]={
50{ 6, 2, 6, 2, 6, 2, 6, 2, },
51{ 0, 4, 0, 4, 0, 4, 0, 4, },
52};
53
54const uint8_t __attribute__((aligned(8))) dither_8x8_32[8][8]={
55{ 17, 9, 23, 15, 16, 8, 22, 14, },
56{ 5, 29, 3, 27, 4, 28, 2, 26, },
57{ 21, 13, 19, 11, 20, 12, 18, 10, },
58{ 0, 24, 6, 30, 1, 25, 7, 31, },
59{ 16, 8, 22, 14, 17, 9, 23, 15, },
60{ 4, 28, 2, 26, 5, 29, 3, 27, },
61{ 20, 12, 18, 10, 21, 13, 19, 11, },
62{ 1, 25, 7, 31, 0, 24, 6, 30, },
63};
64
65#if 0
66const uint8_t __attribute__((aligned(8))) dither_8x8_64[8][8]={
67{ 0, 48, 12, 60, 3, 51, 15, 63, },
68{ 32, 16, 44, 28, 35, 19, 47, 31, },
69{ 8, 56, 4, 52, 11, 59, 7, 55, },
70{ 40, 24, 36, 20, 43, 27, 39, 23, },
71{ 2, 50, 14, 62, 1, 49, 13, 61, },
72{ 34, 18, 46, 30, 33, 17, 45, 29, },
73{ 10, 58, 6, 54, 9, 57, 5, 53, },
74{ 42, 26, 38, 22, 41, 25, 37, 21, },
75};
76#endif
77
78const uint8_t __attribute__((aligned(8))) dither_8x8_73[8][8]={
79{ 0, 55, 14, 68, 3, 58, 17, 72, },
80{ 37, 18, 50, 32, 40, 22, 54, 35, },
81{ 9, 64, 5, 59, 13, 67, 8, 63, },
82{ 46, 27, 41, 23, 49, 31, 44, 26, },
83{ 2, 57, 16, 71, 1, 56, 15, 70, },
84{ 39, 21, 52, 34, 38, 19, 51, 33, },
85{ 11, 66, 7, 62, 10, 65, 6, 60, },
86{ 48, 30, 43, 25, 47, 29, 42, 24, },
87};
88
89#if 0
90const uint8_t __attribute__((aligned(8))) dither_8x8_128[8][8]={
91{ 68, 36, 92, 60, 66, 34, 90, 58, },
92{ 20, 116, 12, 108, 18, 114, 10, 106, },
93{ 84, 52, 76, 44, 82, 50, 74, 42, },
94{ 0, 96, 24, 120, 6, 102, 30, 126, },
95{ 64, 32, 88, 56, 70, 38, 94, 62, },
96{ 16, 112, 8, 104, 22, 118, 14, 110, },
97{ 80, 48, 72, 40, 86, 54, 78, 46, },
98{ 4, 100, 28, 124, 2, 98, 26, 122, },
99};
100#endif
101
102#if 1
103const uint8_t __attribute__((aligned(8))) dither_8x8_220[8][8]={
104{117, 62, 158, 103, 113, 58, 155, 100, },
105{ 34, 199, 21, 186, 31, 196, 17, 182, },
106{144, 89, 131, 76, 141, 86, 127, 72, },
107{ 0, 165, 41, 206, 10, 175, 52, 217, },
108{110, 55, 151, 96, 120, 65, 162, 107, },
109{ 28, 193, 14, 179, 38, 203, 24, 189, },
110{138, 83, 124, 69, 148, 93, 134, 79, },
111{ 7, 172, 48, 213, 3, 168, 45, 210, },
112};
113#elif 1
114// tries to correct a gamma of 1.5
115const uint8_t __attribute__((aligned(8))) dither_8x8_220[8][8]={
116{ 0, 143, 18, 200, 2, 156, 25, 215, },
117{ 78, 28, 125, 64, 89, 36, 138, 74, },
118{ 10, 180, 3, 161, 16, 195, 8, 175, },
119{109, 51, 93, 38, 121, 60, 105, 47, },
120{ 1, 152, 23, 210, 0, 147, 20, 205, },
121{ 85, 33, 134, 71, 81, 30, 130, 67, },
122{ 14, 190, 6, 171, 12, 185, 5, 166, },
123{117, 57, 101, 44, 113, 54, 97, 41, },
124};
125#elif 1
126// tries to correct a gamma of 2.0
127const uint8_t __attribute__((aligned(8))) dither_8x8_220[8][8]={
128{ 0, 124, 8, 193, 0, 140, 12, 213, },
129{ 55, 14, 104, 42, 66, 19, 119, 52, },
130{ 3, 168, 1, 145, 6, 187, 3, 162, },
131{ 86, 31, 70, 21, 99, 39, 82, 28, },
132{ 0, 134, 11, 206, 0, 129, 9, 200, },
133{ 62, 17, 114, 48, 58, 16, 109, 45, },
134{ 5, 181, 2, 157, 4, 175, 1, 151, },
135{ 95, 36, 78, 26, 90, 34, 74, 24, },
136};
137#else
138// tries to correct a gamma of 2.5
139const uint8_t __attribute__((aligned(8))) dither_8x8_220[8][8]={
140{ 0, 107, 3, 187, 0, 125, 6, 212, },
141{ 39, 7, 86, 28, 49, 11, 102, 36, },
142{ 1, 158, 0, 131, 3, 180, 1, 151, },
143{ 68, 19, 52, 12, 81, 25, 64, 17, },
144{ 0, 119, 5, 203, 0, 113, 4, 195, },
145{ 45, 9, 96, 33, 42, 8, 91, 30, },
146{ 2, 172, 1, 144, 2, 165, 0, 137, },
147{ 77, 23, 60, 15, 72, 21, 56, 14, },
148};
149#endif
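These matrices implement ordered dithering: a position-dependent threshold is added to the pixel before quantization (via the table lookups in the *_ordered_dither functions below), trading truncation error for a fixed spatial pattern whose local average approximates the true value. A toy sketch using row 0 of dither_8x8_220, with a truncating quantizer whose step matches the matrix scale (the step value is my illustrative choice):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* row 0 of dither_8x8_220; entries span roughly one quantization step */
    const uint8_t d[8] = { 117, 62, 158, 103, 113, 58, 155, 100 };
    int y = 140, x;                       /* input intensity, 0..255 */
    for (x = 0; x < 8; x++)
        printf("x=%d plain=%d dithered=%d\n", x, y / 220, (y + d[x]) / 220);
    /* plain is always 0; dithered is 1 for roughly y/220 of the positions */
    return 0;
}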
150
151#ifdef HAVE_MMX
152
153/* hope these constant values are cache line aligned */
154DECLARE_ASM_CONST(8, uint64_t, mmx_00ffw) = 0x00ff00ff00ff00ffULL;
155DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL;
156DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL;
157
158// The volatile is required because gcc otherwise optimizes some writes away
159// not knowing that these are read in the ASM block.
160static volatile uint64_t attribute_used __attribute__((aligned(8))) b5Dither;
161static volatile uint64_t attribute_used __attribute__((aligned(8))) g5Dither;
162static volatile uint64_t attribute_used __attribute__((aligned(8))) g6Dither;
163static volatile uint64_t attribute_used __attribute__((aligned(8))) r5Dither;
164
165#undef HAVE_MMX
166
167//MMX versions
168#undef RENAME
169#define HAVE_MMX
170#undef HAVE_MMX2
171#undef HAVE_3DNOW
172#define RENAME(a) a ## _MMX
173#include "yuv2rgb_template.c"
174
175//MMX2 versions
176#undef RENAME
177#define HAVE_MMX
178#define HAVE_MMX2
179#undef HAVE_3DNOW
180#define RENAME(a) a ## _MMX2
181#include "yuv2rgb_template.c"
182
183#endif /* HAVE_MMX */
184
185const int32_t Inverse_Table_6_9[8][4] = {
186 {117504, 138453, 13954, 34903}, /* no sequence_display_extension */
187 {117504, 138453, 13954, 34903}, /* ITU-R Rec. 709 (1990) */
188 {104597, 132201, 25675, 53279}, /* unspecified */
189 {104597, 132201, 25675, 53279}, /* reserved */
190 {104448, 132798, 24759, 53109}, /* FCC */
191 {104597, 132201, 25675, 53279}, /* ITU-R Rec. 624-4 System B, G */
192 {104597, 132201, 25675, 53279}, /* SMPTE 170M */
193 {117579, 136230, 16907, 35559} /* SMPTE 240M (1987) */
194};
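These appear to be the chroma-to-RGB coefficients in 16.16 fixed point, indexed by the MPEG matrix_coefficients value, already rescaled for the 224-step chroma excursion of limited-range video: e.g. 104597/65536 ≈ 1.596 ≈ 1.402 * 255/224, the familiar Rec.601 Cr-to-R factor. That derivation is my reading, not stated in the file; a quick numeric check:

#include <stdio.h>

int main(void)
{
    /* Rec.601: R = Y + 1.402*Cr; rescaled by 255/224 for limited range */
    printf("%.3f vs %d/65536 = %.3f\n",
           1.402 * 255.0 / 224.0, 104597, 104597 / 65536.0);
    return 0;
}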
195
196#define RGB(i) \
197 U = pu[i]; \
198 V = pv[i]; \
199 r = (void *)c->table_rV[V]; \
200 g = (void *)(c->table_gU[U] + c->table_gV[V]); \
201 b = (void *)c->table_bU[U];
202
203#define DST1(i) \
204 Y = py_1[2*i]; \
205 dst_1[2*i] = r[Y] + g[Y] + b[Y]; \
206 Y = py_1[2*i+1]; \
207 dst_1[2*i+1] = r[Y] + g[Y] + b[Y];
208
209#define DST2(i) \
210 Y = py_2[2*i]; \
211 dst_2[2*i] = r[Y] + g[Y] + b[Y]; \
212 Y = py_2[2*i+1]; \
213 dst_2[2*i+1] = r[Y] + g[Y] + b[Y];
214
215#define DST1RGB(i) \
216 Y = py_1[2*i]; \
217 dst_1[6*i] = r[Y]; dst_1[6*i+1] = g[Y]; dst_1[6*i+2] = b[Y]; \
218 Y = py_1[2*i+1]; \
219 dst_1[6*i+3] = r[Y]; dst_1[6*i+4] = g[Y]; dst_1[6*i+5] = b[Y];
220
221#define DST2RGB(i) \
222 Y = py_2[2*i]; \
223 dst_2[6*i] = r[Y]; dst_2[6*i+1] = g[Y]; dst_2[6*i+2] = b[Y]; \
224 Y = py_2[2*i+1]; \
225 dst_2[6*i+3] = r[Y]; dst_2[6*i+4] = g[Y]; dst_2[6*i+5] = b[Y];
226
227#define DST1BGR(i) \
228 Y = py_1[2*i]; \
229 dst_1[6*i] = b[Y]; dst_1[6*i+1] = g[Y]; dst_1[6*i+2] = r[Y]; \
230 Y = py_1[2*i+1]; \
231 dst_1[6*i+3] = b[Y]; dst_1[6*i+4] = g[Y]; dst_1[6*i+5] = r[Y];
232
233#define DST2BGR(i) \
234 Y = py_2[2*i]; \
235 dst_2[6*i] = b[Y]; dst_2[6*i+1] = g[Y]; dst_2[6*i+2] = r[Y]; \
236 Y = py_2[2*i+1]; \
237 dst_2[6*i+3] = b[Y]; dst_2[6*i+4] = g[Y]; dst_2[6*i+5] = r[Y];
238
239#define PROLOG(func_name, dst_type) \
240static int func_name(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, \
241 int srcSliceH, uint8_t* dst[], int dstStride[]){\
242 int y;\
243\
244 if (c->srcFormat == PIX_FMT_YUV422P){\
245 srcStride[1] *= 2;\
246 srcStride[2] *= 2;\
247 }\
248 for (y=0; y<srcSliceH; y+=2){\
249 dst_type *dst_1= (dst_type*)(dst[0] + (y+srcSliceY )*dstStride[0]);\
250 dst_type *dst_2= (dst_type*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);\
251 dst_type av_unused *r, *b;\
252 dst_type *g;\
253 uint8_t *py_1= src[0] + y*srcStride[0];\
254 uint8_t *py_2= py_1 + srcStride[0];\
255 uint8_t *pu= src[1] + (y>>1)*srcStride[1];\
256 uint8_t *pv= src[2] + (y>>1)*srcStride[2];\
257 unsigned int h_size= c->dstW>>3;\
258 while (h_size--) {\
259 int av_unused U, V;\
260 int Y;\
261
262#define EPILOG1(dst_delta)\
263 pu += 4;\
264 pv += 4;\
265 py_1 += 8;\
266 py_2 += 8;\
267 dst_1 += dst_delta;\
268 dst_2 += dst_delta;\
269 }\
270 if (c->dstW & 4) {\
271 int av_unused Y, U, V;\
272
273#define EPILOG2()\
274 }\
275 }\
276 return srcSliceH;\
277}
278
279#define EPILOG(dst_delta)\
280 EPILOG1(dst_delta)\
281 EPILOG2()
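The DST* macros above compose each output pixel as r[Y] + g[Y] + b[Y]. This works because every table stores its channel pre-shifted into a disjoint bit range, and the RGB(i) macro has already offset the table base pointers by the chroma contribution before the lookup. A toy RGB565 version of the same composition (the table construction here is illustrative, not the init code from this library):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    static uint16_t r[256], g[256], b[256];
    int y;
    for (y = 0; y < 256; y++) {
        r[y] = (uint16_t)((y >> 3) << 11); /* 5 bits, already in place */
        g[y] = (uint16_t)((y >> 2) << 5);  /* 6 bits */
        b[y] = (uint16_t)(y >> 3);         /* 5 bits */
    }
    y = 200;
    /* channels occupy disjoint bits, so addition assembles the pixel */
    printf("0x%04X\n", (unsigned)(r[y] + g[y] + b[y]));
    return 0;
}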
282
283PROLOG(yuv2rgb_c_32, uint32_t)
284 RGB(0);
285 DST1(0);
286 DST2(0);
287
288 RGB(1);
289 DST2(1);
290 DST1(1);
291
292 RGB(2);
293 DST1(2);
294 DST2(2);
295
296 RGB(3);
297 DST2(3);
298 DST1(3);
299EPILOG1(8)
300 RGB(0);
301 DST1(0);
302 DST2(0);
303
304 RGB(1);
305 DST2(1);
306 DST1(1);
307EPILOG2()
308
309PROLOG(yuv2rgb_c_24_rgb, uint8_t)
310 RGB(0);
311 DST1RGB(0);
312 DST2RGB(0);
313
314 RGB(1);
315 DST2RGB(1);
316 DST1RGB(1);
317
318 RGB(2);
319 DST1RGB(2);
320 DST2RGB(2);
321
322 RGB(3);
323 DST2RGB(3);
324 DST1RGB(3);
325EPILOG1(24)
326 RGB(0);
327 DST1RGB(0);
328 DST2RGB(0);
329
330 RGB(1);
331 DST2RGB(1);
332 DST1RGB(1);
333EPILOG2()
334
335// only trivial mods from yuv2rgb_c_24_rgb
336PROLOG(yuv2rgb_c_24_bgr, uint8_t)
337 RGB(0);
338 DST1BGR(0);
339 DST2BGR(0);
340
341 RGB(1);
342 DST2BGR(1);
343 DST1BGR(1);
344
345 RGB(2);
346 DST1BGR(2);
347 DST2BGR(2);
348
349 RGB(3);
350 DST2BGR(3);
351 DST1BGR(3);
352EPILOG1(24)
353 RGB(0);
354 DST1BGR(0);
355 DST2BGR(0);
356
357 RGB(1);
358 DST2BGR(1);
359 DST1BGR(1);
360EPILOG2()
361
362// This is exactly the same code as yuv2rgb_c_32 except for the types of
363// r, g, b, dst_1, dst_2
364PROLOG(yuv2rgb_c_16, uint16_t)
365 RGB(0);
366 DST1(0);
367 DST2(0);
368
369 RGB(1);
370 DST2(1);
371 DST1(1);
372
373 RGB(2);
374 DST1(2);
375 DST2(2);
376
377 RGB(3);
378 DST2(3);
379 DST1(3);
380EPILOG(8)
381
382#if HAVE_DEAD_CODE
383// This is exactly the same code as yuv2rgb_c_32 except for the types of
384// r, g, b, dst_1, dst_2
385PROLOG(yuv2rgb_c_8, uint8_t)
386 RGB(0);
387 DST1(0);
388 DST2(0);
389
390 RGB(1);
391 DST2(1);
392 DST1(1);
393
394 RGB(2);
395 DST1(2);
396 DST2(2);
397
398 RGB(3);
399 DST2(3);
400 DST1(3);
401EPILOG(8)
402#endif
403
404// Same table-lookup code as yuv2rgb_c_32, with ordered dither added to the r, g, b lookups
405PROLOG(yuv2rgb_c_8_ordered_dither, uint8_t)
406 const uint8_t *d32= dither_8x8_32[y&7];
407 const uint8_t *d64= dither_8x8_73[y&7];
408#define DST1bpp8(i,o) \
409 Y = py_1[2*i]; \
410 dst_1[2*i] = r[Y+d32[0+o]] + g[Y+d32[0+o]] + b[Y+d64[0+o]]; \
411 Y = py_1[2*i+1]; \
412 dst_1[2*i+1] = r[Y+d32[1+o]] + g[Y+d32[1+o]] + b[Y+d64[1+o]];
413
414#define DST2bpp8(i,o) \
415 Y = py_2[2*i]; \
416 dst_2[2*i] = r[Y+d32[8+o]] + g[Y+d32[8+o]] + b[Y+d64[8+o]]; \
417 Y = py_2[2*i+1]; \
418 dst_2[2*i+1] = r[Y+d32[9+o]] + g[Y+d32[9+o]] + b[Y+d64[9+o]];
419
420
421 RGB(0);
422 DST1bpp8(0,0);
423 DST2bpp8(0,0);
424
425 RGB(1);
426 DST2bpp8(1,2);
427 DST1bpp8(1,2);
428
429 RGB(2);
430 DST1bpp8(2,4);
431 DST2bpp8(2,4);
432
433 RGB(3);
434 DST2bpp8(3,6);
435 DST1bpp8(3,6);
436EPILOG(8)
437
438
439// This is exactly the same code as yuv2rgb_c_32 except for the types of
440// r, g, b, dst_1, dst_2
441#if HAVE_DEAD_CODE
442PROLOG(yuv2rgb_c_4, uint8_t)
443 int acc;
444#define DST1_4(i) \
445 Y = py_1[2*i]; \
446 acc = r[Y] + g[Y] + b[Y]; \
447 Y = py_1[2*i+1]; \
448 acc |= (r[Y] + g[Y] + b[Y])<<4; \
449 dst_1[i] = acc;
450
451#define DST2_4(i) \
452 Y = py_2[2*i]; \
453 acc = r[Y] + g[Y] + b[Y]; \
454 Y = py_2[2*i+1]; \
455 acc |= (r[Y] + g[Y] + b[Y])<<4; \
456 dst_2[i] = acc;
457
458 RGB(0);
459 DST1_4(0);
460 DST2_4(0);
461
462 RGB(1);
463 DST2_4(1);
464 DST1_4(1);
465
466 RGB(2);
467 DST1_4(2);
468 DST2_4(2);
469
470 RGB(3);
471 DST2_4(3);
472 DST1_4(3);
473EPILOG(4)
474#endif
475
476PROLOG(yuv2rgb_c_4_ordered_dither, uint8_t)
477 const uint8_t *d64= dither_8x8_73[y&7];
478 const uint8_t *d128=dither_8x8_220[y&7];
479 int acc;
480
481#define DST1bpp4(i,o) \
482 Y = py_1[2*i]; \
483 acc = r[Y+d128[0+o]] + g[Y+d64[0+o]] + b[Y+d128[0+o]]; \
484 Y = py_1[2*i+1]; \
485 acc |= (r[Y+d128[1+o]] + g[Y+d64[1+o]] + b[Y+d128[1+o]])<<4; \
486 dst_1[i]= acc;
487
488#define DST2bpp4(i,o) \
489 Y = py_2[2*i]; \
490 acc = r[Y+d128[8+o]] + g[Y+d64[8+o]] + b[Y+d128[8+o]]; \
491 Y = py_2[2*i+1]; \
492 acc |= (r[Y+d128[9+o]] + g[Y+d64[9+o]] + b[Y+d128[9+o]])<<4; \
493 dst_2[i]= acc;
494
495
496 RGB(0);
497 DST1bpp4(0,0);
498 DST2bpp4(0,0);
499
500 RGB(1);
501 DST2bpp4(1,2);
502 DST1bpp4(1,2);
503
504 RGB(2);
505 DST1bpp4(2,4);
506 DST2bpp4(2,4);
507
508 RGB(3);
509 DST2bpp4(3,6);
510 DST1bpp4(3,6);
511EPILOG(4)
512
513// This is exactly the same code as yuv2rgb_c_32 except for the types of
514// r, g, b, dst_1, dst_2
515#if HAVE_DEAD_CODE
516PROLOG(yuv2rgb_c_4b, uint8_t)
517 RGB(0);
518 DST1(0);
519 DST2(0);
520
521 RGB(1);
522 DST2(1);
523 DST1(1);
524
525 RGB(2);
526 DST1(2);
527 DST2(2);
528
529 RGB(3);
530 DST2(3);
531 DST1(3);
532EPILOG(8)
533#endif
534
535PROLOG(yuv2rgb_c_4b_ordered_dither, uint8_t)
536 const uint8_t *d64= dither_8x8_73[y&7];
537 const uint8_t *d128=dither_8x8_220[y&7];
538
539#define DST1bpp4b(i,o) \
540 Y = py_1[2*i]; \
541 dst_1[2*i] = r[Y+d128[0+o]] + g[Y+d64[0+o]] + b[Y+d128[0+o]]; \
542 Y = py_1[2*i+1]; \
543 dst_1[2*i+1] = r[Y+d128[1+o]] + g[Y+d64[1+o]] + b[Y+d128[1+o]];
544
545#define DST2bpp4b(i,o) \
546 Y = py_2[2*i]; \
547 dst_2[2*i] = r[Y+d128[8+o]] + g[Y+d64[8+o]] + b[Y+d128[8+o]]; \
548 Y = py_2[2*i+1]; \
549 dst_2[2*i+1] = r[Y+d128[9+o]] + g[Y+d64[9+o]] + b[Y+d128[9+o]];
550
551
552 RGB(0);
553 DST1bpp4b(0,0);
554 DST2bpp4b(0,0);
555
556 RGB(1);
557 DST2bpp4b(1,2);
558 DST1bpp4b(1,2);
559
560 RGB(2);
561 DST1bpp4b(2,4);
562 DST2bpp4b(2,4);
563
564 RGB(3);
565 DST2bpp4b(3,6);
566 DST1bpp4b(3,6);
567EPILOG(8)
568
569PROLOG(yuv2rgb_c_1_ordered_dither, uint8_t)
570 const uint8_t *d128=dither_8x8_220[y&7];
571 char out_1=0, out_2=0;
572 g= c->table_gU[128] + c->table_gV[128];
573
574#define DST1bpp1(i,o) \
575 Y = py_1[2*i]; \
576 out_1+= out_1 + g[Y+d128[0+o]]; \
577 Y = py_1[2*i+1]; \
578 out_1+= out_1 + g[Y+d128[1+o]];
579
580#define DST2bpp1(i,o) \
581 Y = py_2[2*i]; \
582 out_2+= out_2 + g[Y+d128[8+o]]; \
583 Y = py_2[2*i+1]; \
584 out_2+= out_2 + g[Y+d128[9+o]];
585
586 DST1bpp1(0,0);
587 DST2bpp1(0,0);
588
589 DST2bpp1(1,2);
590 DST1bpp1(1,2);
591
592 DST1bpp1(2,4);
593 DST2bpp1(2,4);
594
595 DST2bpp1(3,6);
596 DST1bpp1(3,6);
597
598 dst_1[0]= out_1;
599 dst_2[0]= out_2;
600EPILOG(1)
601
602SwsFunc yuv2rgb_get_func_ptr (SwsContext *c)
603{
604#if defined(HAVE_MMX2) || defined(HAVE_MMX)
605 if (c->flags & SWS_CPU_CAPS_MMX2){
606 switch(c->dstFormat){
607 case PIX_FMT_RGB32: return yuv420_rgb32_MMX2;
608 case PIX_FMT_BGR24: return yuv420_rgb24_MMX2;
609 case PIX_FMT_BGR565: return yuv420_rgb16_MMX2;
610 case PIX_FMT_BGR555: return yuv420_rgb15_MMX2;
611 }
612 }
613 if (c->flags & SWS_CPU_CAPS_MMX){
614 switch(c->dstFormat){
615 case PIX_FMT_RGB32: return yuv420_rgb32_MMX;
616 case PIX_FMT_BGR24: return yuv420_rgb24_MMX;
617 case PIX_FMT_BGR565: return yuv420_rgb16_MMX;
618 case PIX_FMT_BGR555: return yuv420_rgb15_MMX;
619 }
620 }
621#endif
622#ifdef HAVE_VIS
623 {
624 SwsFunc t= yuv2rgb_init_vis(c);
625 if (t) return t;
626 }
627#endif
628#ifdef CONFIG_MLIB
629 {
630 SwsFunc t= yuv2rgb_init_mlib(c);
631 if (t) return t;
632 }
633#endif
634#ifdef HAVE_ALTIVEC
635 if (c->flags & SWS_CPU_CAPS_ALTIVEC)
636 {
637 SwsFunc t = yuv2rgb_init_altivec(c);
638 if (t) return t;
639 }
640#endif
641
642#ifdef ARCH_BFIN
643 if (c->flags & SWS_CPU_CAPS_BFIN)
644 {
645 SwsFunc t = ff_bfin_yuv2rgb_get_func_ptr (c);
646 if (t) return t;
647 }
648#endif
649
650 av_log(c, AV_LOG_WARNING, "No accelerated colorspace conversion found.\n");
651
652 switch(c->dstFormat){
653 case PIX_FMT_BGR32:
654 case PIX_FMT_RGB32: return yuv2rgb_c_32;
655 case PIX_FMT_RGB24: return yuv2rgb_c_24_rgb;
656 case PIX_FMT_BGR24: return yuv2rgb_c_24_bgr;
657 case PIX_FMT_RGB565:
658 case PIX_FMT_BGR565:
659 case PIX_FMT_RGB555:
660 case PIX_FMT_BGR555: return yuv2rgb_c_16;
661 case PIX_FMT_RGB8:
662 case PIX_FMT_BGR8: return yuv2rgb_c_8_ordered_dither;
663 case PIX_FMT_RGB4:
664 case PIX_FMT_BGR4: return yuv2rgb_c_4_ordered_dither;
665 case PIX_FMT_RGB4_BYTE:
666 case PIX_FMT_BGR4_BYTE: return yuv2rgb_c_4b_ordered_dither;
667 case PIX_FMT_MONOBLACK: return yuv2rgb_c_1_ordered_dither;
668 default:
669 assert(0);
670 }
671 return NULL;
672}
673
674static int div_round (int dividend, int divisor)
675{
676 if (dividend > 0)
677 return (dividend + (divisor>>1)) / divisor;
678 else
679 return -((-dividend + (divisor>>1)) / divisor);
680}
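div_round() rounds to nearest with ties going away from zero; plain integer division would truncate toward zero and bias the table offsets computed below. A brief demonstration (the function is duplicated so the sketch is self-contained):

#include <stdio.h>

static int div_round(int dividend, int divisor)
{
    if (dividend > 0)
        return (dividend + (divisor >> 1)) / divisor;
    else
        return -((-dividend + (divisor >> 1)) / divisor);
}

int main(void)
{
    /* prints 3 -3 0 -- versus 2 -2 0 from truncating division */
    printf("%d %d %d\n", div_round(5, 2), div_round(-5, 2), div_round(-1, 3));
    return 0;
}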
681
682int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation)
683{
684 const int isRgb = isBGR(c->dstFormat);
685 const int bpp = fmt_depth(c->dstFormat);
686 int i;
687 uint8_t table_Y[1024];
688 uint32_t *table_32 = 0;
689 uint16_t *table_16 = 0;
690 uint8_t *table_8 = 0;
691 uint8_t *table_332 = 0;
692 uint8_t *table_121 = 0;
693 uint8_t *table_1 = 0;
694 int entry_size = 0;
695 void *table_r = 0, *table_g = 0, *table_b = 0;
696 void *table_start;
697
698 int64_t crv = inv_table[0];
699 int64_t cbu = inv_table[1];
700 int64_t cgu = -inv_table[2];
701 int64_t cgv = -inv_table[3];
702 int64_t cy = 1<<16;
703 int64_t oy = 0;
704
705//printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
706 if (!fullRange){
707 cy= (cy*255) / 219;
708 oy= 16<<16;
709 }else{
710 crv= (crv*224) / 255;
711 cbu= (cbu*224) / 255;
712 cgu= (cgu*224) / 255;
713 cgv= (cgv*224) / 255;
714 }
715
716 cy = (cy *contrast )>>16;
717 crv= (crv*contrast * saturation)>>32;
718 cbu= (cbu*contrast * saturation)>>32;
719 cgu= (cgu*contrast * saturation)>>32;
720 cgv= (cgv*contrast * saturation)>>32;
721//printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
722 oy -= 256*brightness;
723
724 for (i = 0; i < 1024; i++) {
725 int j;
726
727 j= (cy*(((i - 384)<<16) - oy) + (1<<31))>>32;
728 j = (j < 0) ? 0 : ((j > 255) ? 255 : j);
729 table_Y[i] = j;
730 }
731
732 switch (bpp) {
733 case 32:
734 table_start= table_32 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint32_t));
735
736 entry_size = sizeof (uint32_t);
737 table_r = table_32 + 197;
738 table_b = table_32 + 197 + 685;
739 table_g = table_32 + 197 + 2*682;
740
741 for (i = -197; i < 256+197; i++)
742 ((uint32_t *)table_r)[i] = table_Y[i+384] << (isRgb ? 16 : 0);
743 for (i = -132; i < 256+132; i++)
744 ((uint32_t *)table_g)[i] = table_Y[i+384] << 8;
745 for (i = -232; i < 256+232; i++)
746 ((uint32_t *)table_b)[i] = table_Y[i+384] << (isRgb ? 0 : 16);
747 break;
748
749 case 24:
750 table_start= table_8 = av_malloc ((256 + 2*232) * sizeof (uint8_t));
751
752 entry_size = sizeof (uint8_t);
753 table_r = table_g = table_b = table_8 + 232;
754
755 for (i = -232; i < 256+232; i++)
756 ((uint8_t * )table_b)[i] = table_Y[i+384];
757 break;
758
759 case 15:
760 case 16:
761 table_start= table_16 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint16_t));
762
763 entry_size = sizeof (uint16_t);
764 table_r = table_16 + 197;
765 table_b = table_16 + 197 + 685;
766 table_g = table_16 + 197 + 2*682;
767
768 for (i = -197; i < 256+197; i++) {
769 int j = table_Y[i+384] >> 3;
770
771 if (isRgb)
772 j <<= ((bpp==16) ? 11 : 10);
773
774 ((uint16_t *)table_r)[i] = j;
775 }
776 for (i = -132; i < 256+132; i++) {
777 int j = table_Y[i+384] >> ((bpp==16) ? 2 : 3);
778
779 ((uint16_t *)table_g)[i] = j << 5;
780 }
781 for (i = -232; i < 256+232; i++) {
782 int j = table_Y[i+384] >> 3;
783
784 if (!isRgb)
785 j <<= ((bpp==16) ? 11 : 10);
786
787 ((uint16_t *)table_b)[i] = j;
788 }
789 break;
790
791 case 8:
792 table_start= table_332 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint8_t));
793
794 entry_size = sizeof (uint8_t);
795 table_r = table_332 + 197;
796 table_b = table_332 + 197 + 685;
797 table_g = table_332 + 197 + 2*682;
798
799 for (i = -197; i < 256+197; i++) {
800 int j = (table_Y[i+384 - 16] + 18)/36;
801
802 if (isRgb)
803 j <<= 5;
804
805 ((uint8_t *)table_r)[i] = j;
806 }
807 for (i = -132; i < 256+132; i++) {
808 int j = (table_Y[i+384 - 16] + 18)/36;
809
810 if (!isRgb)
811 j <<= 1;
812
813 ((uint8_t *)table_g)[i] = j << 2;
814 }
815 for (i = -232; i < 256+232; i++) {
816 int j = (table_Y[i+384 - 37] + 43)/85;
817
818 if (!isRgb)
819 j <<= 6;
820
821 ((uint8_t *)table_b)[i] = j;
822 }
823 break;
824 case 4:
825 case 4|128:
826 table_start= table_121 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint8_t));
827
828 entry_size = sizeof (uint8_t);
829 table_r = table_121 + 197;
830 table_b = table_121 + 197 + 685;
831 table_g = table_121 + 197 + 2*682;
832
833 for (i = -197; i < 256+197; i++) {
834 int j = table_Y[i+384 - 110] >> 7;
835
836 if (isRgb)
837 j <<= 3;
838
839 ((uint8_t *)table_r)[i] = j;
840 }
841 for (i = -132; i < 256+132; i++) {
842 int j = (table_Y[i+384 - 37]+ 43)/85;
843
844 ((uint8_t *)table_g)[i] = j << 1;
845 }
846 for (i = -232; i < 256+232; i++) {
847 int j =table_Y[i+384 - 110] >> 7;
848
849 if (!isRgb)
850 j <<= 3;
851
852 ((uint8_t *)table_b)[i] = j;
853 }
854 break;
855
856 case 1:
857 table_start= table_1 = av_malloc (256*2 * sizeof (uint8_t));
858
859 entry_size = sizeof (uint8_t);
860 table_g = table_1;
861 table_r = table_b = NULL;
862
863 for (i = 0; i < 256+256; i++) {
864 int j = table_Y[i + 384 - 110]>>7;
865
866 ((uint8_t *)table_g)[i] = j;
867 }
868 break;
869
870 default:
871 table_start= NULL;
872 av_log(c, AV_LOG_ERROR, "%ibpp not supported by yuv2rgb\n", bpp);
873 //free mem?
874 return -1;
875 }
876
877 for (i = 0; i < 256; i++) {
878 c->table_rV[i] = (uint8_t *)table_r + entry_size * div_round (crv * (i-128), 76309);
879 c->table_gU[i] = (uint8_t *)table_g + entry_size * div_round (cgu * (i-128), 76309);
880 c->table_gV[i] = entry_size * div_round (cgv * (i-128), 76309);
881 c->table_bU[i] = (uint8_t *)table_b + entry_size * div_round (cbu * (i-128), 76309);
882 }
883
884 av_free(c->yuvTable);
885 c->yuvTable= table_start;
886 return 0;
887}
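For orientation, the tables built above are additive lookup tables: chroma picks a base pointer into each channel table (table_gV stores a byte offset rather than a pointer), luma indexes it, and the channels combine with plain addition because each table populates disjoint bits. A minimal sketch of that consumption pattern, modeled on the plain C converter and not part of the deleted file:

    /* Hypothetical sketch of the table-driven conversion (32 bpp path). */
    static void yuv2rgb32_pair_sketch(SwsContext *c,
                                      uint8_t y0, uint8_t y1, /* two lumas  */
                                      uint8_t u, uint8_t v,   /* one chroma */
                                      uint32_t *dst)
    {
        const uint32_t *r = (const uint32_t *) c->table_rV[v];
        const uint32_t *g = (const uint32_t *)(c->table_gU[u] + c->table_gV[v]);
        const uint32_t *b = (const uint32_t *) c->table_bU[u];

        /* Each table fills disjoint bits, so '+' assembles the pixel. */
        dst[0] = r[y0] + g[y0] + b[y0];
        dst[1] = r[y1] + g[y1] + b[y1];
    }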
diff --git a/src/plugins/ffmpeg/libswscale/yuv2rgb_altivec.c b/src/plugins/ffmpeg/libswscale/yuv2rgb_altivec.c
deleted file mode 100644
index 13b18d1..0000000
--- a/src/plugins/ffmpeg/libswscale/yuv2rgb_altivec.c
+++ /dev/null
@@ -1,965 +0,0 @@
1/*
2 * AltiVec acceleration for colorspace conversion
3 *
4 * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23/*
24Converts I420/YV12 to RGB in various formats.
25 It rejects images that are not in a 4:2:0 format,
26 images whose widths are not multiples of 16,
27 and images whose heights are not multiples of 2.
28Rejected images are deferred to the C simulation code.
29
30Lots of optimizations to be done here.
31
321. Need to fix the saturation code. I just couldn't get it to fly with packs
33 and adds, so we currently use max/min to clip.
34
352. The inefficient chroma loading needs a bit of brushing up.
36
373. Analysis of pipeline stalls needs to be done; use Shark to identify
38 them.
39
40
41MODIFIED to calculate coeffs from currently selected color space.
42MODIFIED core to be a macro where you specify the output format.
43ADDED UYVY conversion, which is never called due to something in swscale.
44CORRECTED algorithm selection to be strict on input formats.
45ADDED runtime detection of AltiVec.
46
47ADDED altivec_yuv2packedX vertical scale + RGB converter.
48
49March 27,2004
50PERFORMANCE ANALYSIS
51
52The C version uses 25% of the processor, or ~250 MIPS, with D1 rawvideo
53used as the test.
54The AltiVec version uses 10% of the processor, or ~100 MIPS, for the
55same sequence.
56
57720 * 480 * 30 is roughly 10 Mpixels/s,
58
59so we have roughly 10 clocks per pixel. This is too high; something has
60to be wrong.
61
62OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
63need for vec_min.
64
65OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
66the input video frame: it was just decompressed, so it probably resides in the
67L1 caches. However, we are creating the output video stream, which needs to use
68the DSTST instruction to optimize for the cache. We couple this with the fact
69that we are not going to visit the input buffer again, so we mark it Least
70Recently Used. This shaves 25% off the processor cycles.
71
72Now memcpy is the largest MIPS consumer in the system, probably due
73to the inefficient X11 stuff.
74
75GL libraries seem to be very slow on this machine (a 1.33 GHz PowerBook
76running Jaguar); this is not the case for my 1 GHz PowerBook. I thought it
77might be a versioning issue; however, I have libGL.1.2.dylib on both
78machines. (We need to figure this out now.)
79
80GL2 libraries work now with patch for RGB32.
81
82NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
83
84Integrated luma prescaling for saturation/contrast/brightness
85adjustment.
86*/
87
88#include <stdio.h>
89#include <stdlib.h>
90#include <string.h>
91#include <inttypes.h>
92#include <assert.h>
93#include "config.h"
94#ifdef HAVE_MALLOC_H
95#include <malloc.h>
96#endif
97#include "rgb2rgb.h"
98#include "swscale.h"
99#include "swscale_internal.h"
100
101#undef PROFILE_THE_BEAST
102#undef INC_SCALING
103
104typedef unsigned char ubyte;
105typedef signed char sbyte;
106
107
108/* RGB interleaver, 16 planar pels 8-bit samples per channel in
109 homogeneous vector registers x0,x1,x2 are interleaved with the
110 following technique:
111
112 o0 = vec_mergeh (x0,x1);
113 o1 = vec_perm (o0, x2, perm_rgb_0);
114 o2 = vec_perm (o0, x2, perm_rgb_1);
115 o3 = vec_mergel (x0,x1);
116 o4 = vec_perm (o3,o2,perm_rgb_2);
117 o5 = vec_perm (o3,o2,perm_rgb_3);
118
119 perm_rgb_0: o0(RG).h v1(B) --> o1*
120 0 1 2 3 4
121 rgbr|gbrg|brgb|rgbr
122 0010 0100 1001 0010
123 0102 3145 2673 894A
124
125 perm_rgb_1: o0(RG).h v1(B) --> o2
126 0 1 2 3 4
127 gbrg|brgb|bbbb|bbbb
128 0100 1001 1111 1111
129 B5CD 6EF7 89AB CDEF
130
131 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
132 0 1 2 3 4
133 gbrg|brgb|rgbr|gbrg
134 1111 1111 0010 0100
135 89AB CDEF 0182 3945
136
137 perm_rgb_2: o3(RG).l o2(rgbB.l) ---> o5*
138 0 1 2 3 4
139 brgb|rgbr|gbrg|brgb
140 1001 0010 0100 1001
141 a67b 89cA BdCD eEFf
142
143*/
144static
145const vector unsigned char
146 perm_rgb_0 = AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
147 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
148 perm_rgb_1 = AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
149 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
150 perm_rgb_2 = AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
151 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
152 perm_rgb_3 = AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
153 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
154
155#define vec_merge3(x2,x1,x0,y0,y1,y2) \
156do { \
157 typeof(x0) o0,o2,o3; \
158 o0 = vec_mergeh (x0,x1); \
159 y0 = vec_perm (o0, x2, perm_rgb_0); \
160 o2 = vec_perm (o0, x2, perm_rgb_1); \
161 o3 = vec_mergel (x0,x1); \
162 y1 = vec_perm (o3,o2,perm_rgb_2); \
163 y2 = vec_perm (o3,o2,perm_rgb_3); \
164} while(0)
165
166#define vec_mstbgr24(x0,x1,x2,ptr) \
167do { \
168 typeof(x0) _0,_1,_2; \
169 vec_merge3 (x0,x1,x2,_0,_1,_2); \
170 vec_st (_0, 0, ptr++); \
171 vec_st (_1, 0, ptr++); \
172 vec_st (_2, 0, ptr++); \
173} while (0);
174
175#define vec_mstrgb24(x0,x1,x2,ptr) \
176do { \
177 typeof(x0) _0,_1,_2; \
178 vec_merge3 (x2,x1,x0,_0,_1,_2); \
179 vec_st (_0, 0, ptr++); \
180 vec_st (_1, 0, ptr++); \
181 vec_st (_2, 0, ptr++); \
182} while (0);
183
184/* pack the pixels in rgb0 format
185 msb R
186 lsb 0
187*/
188#define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \
189do { \
190 T _0,_1,_2,_3; \
191 _0 = vec_mergeh (x0,x1); \
192 _1 = vec_mergeh (x2,x3); \
193 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
194 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
195 vec_st (_2, 0*16, (T *)ptr); \
196 vec_st (_3, 1*16, (T *)ptr); \
197 _0 = vec_mergel (x0,x1); \
198 _1 = vec_mergel (x2,x3); \
199 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
200 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
201 vec_st (_2, 2*16, (T *)ptr); \
202 vec_st (_3, 3*16, (T *)ptr); \
203 ptr += 4; \
204} while (0);
205
206/*
207
208 | 1 0 1.4021 | | Y |
209 | 1 -0.3441 -0.7142 |x| Cb|
210 | 1 1.7718 0 | | Cr|
211
212
213 Y: [-128 127]
214 Cb/Cr : [-128 127]
215
216 Typical YUV conversion works on Y in 0-255; this version has been optimized for JPEG decode.
217
218*/
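As a cross-check of the matrix above, a scalar sketch (not in the original file) of the same conversion, with y/cb/cr centered to [-128, 127] as stated and the result re-biased to [0, 255]:

    #include <math.h>

    /* Hypothetical scalar reference for the matrix above. */
    static int clip8(double x) { return x < 0 ? 0 : x > 255 ? 255 : (int)lrint(x); }

    static void yuv2rgb_scalar(int y, int cb, int cr, int *r, int *g, int *b)
    {
        *r = clip8(128 + y + 1.4021 * cr);
        *g = clip8(128 + y - 0.3441 * cb - 0.7142 * cr);
        *b = clip8(128 + y + 1.7718 * cb);
    }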
219
220
221
222
223#define vec_unh(x) \
224 (vector signed short) \
225 vec_perm(x,(typeof(x))AVV(0),\
226 (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
227 0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07))
228#define vec_unl(x) \
229 (vector signed short) \
230 vec_perm(x,(typeof(x))AVV(0),\
231 (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
232 0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F))
233
234#define vec_clip_s16(x) \
235 vec_max (vec_min (x, (vector signed short)AVV(235,235,235,235,235,235,235,235)),\
236 (vector signed short)AVV( 16, 16, 16, 16, 16, 16, 16, 16))
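vec_clip_s16 clamps each lane to the CCIR-601 studio range; per element it is equivalent to this sketch (mine, not from the file):

    /* Scalar model of vec_clip_s16: clamp one lane to [16, 235]. */
    static short clip_s16_model(short x)
    {
        return x < 16 ? 16 : x > 235 ? 235 : x;
    }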
237
238#define vec_packclp(x,y) \
239 (vector unsigned char)vec_packs \
240 ((vector unsigned short)vec_max (x,(vector signed short) AVV(0)), \
241 (vector unsigned short)vec_max (y,(vector signed short) AVV(0)))
242
243//#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)
244
245
246static inline void cvtyuvtoRGB (SwsContext *c,
247 vector signed short Y, vector signed short U, vector signed short V,
248 vector signed short *R, vector signed short *G, vector signed short *B)
249{
250 vector signed short vx,ux,uvx;
251
252 Y = vec_mradds (Y, c->CY, c->OY);
253 U = vec_sub (U,(vector signed short)
254 vec_splat((vector signed short)AVV(128),0));
255 V = vec_sub (V,(vector signed short)
256 vec_splat((vector signed short)AVV(128),0));
257
258 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
259 ux = vec_sl (U, c->CSHIFT);
260 *B = vec_mradds (ux, c->CBU, Y);
261
262 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
263 vx = vec_sl (V, c->CSHIFT);
264 *R = vec_mradds (vx, c->CRV, Y);
265
266 // uvx = ((CGU*u) + (CGV*v))>>15;
267 uvx = vec_mradds (U, c->CGU, Y);
268 *G = vec_mradds (V, c->CGV, uvx);
269}
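cvtyuvtoRGB leans on vec_mradds for its Q15 fixed-point math. Per lane, that intrinsic multiplies, rounds, shifts right by 15, and then does a saturating add, as in this scalar model (an illustration, not code from the file):

    /* Scalar model of one vec_mradds lane. */
    static short mradds_model(short a, short b, short c)
    {
        int t = ((a * b + 0x4000) >> 15) + c; /* round-to-nearest Q15 product */
        if (t >  32767) t =  32767;           /* saturate like the intrinsic  */
        if (t < -32768) t = -32768;
        return (short) t;
    }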
270
271
272/*
273 ------------------------------------------------------------------------------
274 CS converters
275 ------------------------------------------------------------------------------
276*/
277
278
279#define DEFCSP420_CVT(name,out_pixels) \
280static int altivec_##name (SwsContext *c, \
281 unsigned char **in, int *instrides, \
282 int srcSliceY, int srcSliceH, \
283 unsigned char **oplanes, int *outstrides) \
284{ \
285 int w = c->srcW; \
286 int h = srcSliceH; \
287 int i,j; \
288 int instrides_scl[3]; \
289 vector unsigned char y0,y1; \
290 \
291 vector signed char u,v; \
292 \
293 vector signed short Y0,Y1,Y2,Y3; \
294 vector signed short U,V; \
295 vector signed short vx,ux,uvx; \
296 vector signed short vx0,ux0,uvx0; \
297 vector signed short vx1,ux1,uvx1; \
298 vector signed short R0,G0,B0; \
299 vector signed short R1,G1,B1; \
300 vector unsigned char R,G,B; \
301 \
302 vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \
303 vector unsigned char align_perm; \
304 \
305 vector signed short \
306 lCY = c->CY, \
307 lOY = c->OY, \
308 lCRV = c->CRV, \
309 lCBU = c->CBU, \
310 lCGU = c->CGU, \
311 lCGV = c->CGV; \
312 \
313 vector unsigned short lCSHIFT = c->CSHIFT; \
314 \
315 ubyte *y1i = in[0]; \
316 ubyte *y2i = in[0]+instrides[0]; \
317 ubyte *ui = in[1]; \
318 ubyte *vi = in[2]; \
319 \
320 vector unsigned char *oute \
321 = (vector unsigned char *) \
322 (oplanes[0]+srcSliceY*outstrides[0]); \
323 vector unsigned char *outo \
324 = (vector unsigned char *) \
325 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); \
326 \
327 \
328 instrides_scl[0] = instrides[0]*2-w; /* the loop moves y{1,2}i by w */ \
329 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ \
330 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ \
331 \
332 \
333 for (i=0;i<h/2;i++) { \
334 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); \
335 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); \
336 \
337 for (j=0;j<w/16;j++) { \
338 \
339 y1ivP = (vector unsigned char *)y1i; \
340 y2ivP = (vector unsigned char *)y2i; \
341 uivP = (vector unsigned char *)ui; \
342 vivP = (vector unsigned char *)vi; \
343 \
344 align_perm = vec_lvsl (0, y1i); \
345 y0 = (vector unsigned char) \
346 vec_perm (y1ivP[0], y1ivP[1], align_perm); \
347 \
348 align_perm = vec_lvsl (0, y2i); \
349 y1 = (vector unsigned char) \
350 vec_perm (y2ivP[0], y2ivP[1], align_perm); \
351 \
352 align_perm = vec_lvsl (0, ui); \
353 u = (vector signed char) \
354 vec_perm (uivP[0], uivP[1], align_perm); \
355 \
356 align_perm = vec_lvsl (0, vi); \
357 v = (vector signed char) \
358 vec_perm (vivP[0], vivP[1], align_perm); \
359 \
360 u = (vector signed char) \
361 vec_sub (u,(vector signed char) \
362 vec_splat((vector signed char)AVV(128),0)); \
363 v = (vector signed char) \
364 vec_sub (v,(vector signed char) \
365 vec_splat((vector signed char)AVV(128),0)); \
366 \
367 U = vec_unpackh (u); \
368 V = vec_unpackh (v); \
369 \
370 \
371 Y0 = vec_unh (y0); \
372 Y1 = vec_unl (y0); \
373 Y2 = vec_unh (y1); \
374 Y3 = vec_unl (y1); \
375 \
376 Y0 = vec_mradds (Y0, lCY, lOY); \
377 Y1 = vec_mradds (Y1, lCY, lOY); \
378 Y2 = vec_mradds (Y2, lCY, lOY); \
379 Y3 = vec_mradds (Y3, lCY, lOY); \
380 \
381 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ \
382 ux = vec_sl (U, lCSHIFT); \
383 ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0)); \
384 ux0 = vec_mergeh (ux,ux); \
385 ux1 = vec_mergel (ux,ux); \
386 \
387 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ \
388 vx = vec_sl (V, lCSHIFT); \
389 vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0)); \
390 vx0 = vec_mergeh (vx,vx); \
391 vx1 = vec_mergel (vx,vx); \
392 \
393 /* uvx = ((CGU*u) + (CGV*v))>>15 */ \
394 uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0)); \
395 uvx = vec_mradds (V, lCGV, uvx); \
396 uvx0 = vec_mergeh (uvx,uvx); \
397 uvx1 = vec_mergel (uvx,uvx); \
398 \
399 R0 = vec_add (Y0,vx0); \
400 G0 = vec_add (Y0,uvx0); \
401 B0 = vec_add (Y0,ux0); \
402 R1 = vec_add (Y1,vx1); \
403 G1 = vec_add (Y1,uvx1); \
404 B1 = vec_add (Y1,ux1); \
405 \
406 R = vec_packclp (R0,R1); \
407 G = vec_packclp (G0,G1); \
408 B = vec_packclp (B0,B1); \
409 \
410 out_pixels(R,G,B,oute); \
411 \
412 R0 = vec_add (Y2,vx0); \
413 G0 = vec_add (Y2,uvx0); \
414 B0 = vec_add (Y2,ux0); \
415 R1 = vec_add (Y3,vx1); \
416 G1 = vec_add (Y3,uvx1); \
417 B1 = vec_add (Y3,ux1); \
418 R = vec_packclp (R0,R1); \
419 G = vec_packclp (G0,G1); \
420 B = vec_packclp (B0,B1); \
421 \
422 \
423 out_pixels(R,G,B,outo); \
424 \
425 y1i += 16; \
426 y2i += 16; \
427 ui += 8; \
428 vi += 8; \
429 \
430 } \
431 \
432 outo += (outstrides[0])>>4; \
433 oute += (outstrides[0])>>4; \
434 \
435 ui += instrides_scl[1]; \
436 vi += instrides_scl[2]; \
437 y1i += instrides_scl[0]; \
438 y2i += instrides_scl[0]; \
439 } \
440 return srcSliceH; \
441}
442
443
444#define out_abgr(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),c,b,a,ptr)
445#define out_bgra(a,b,c,ptr) vec_mstrgb32(typeof(a),c,b,a,((typeof (a))AVV(0)),ptr)
446#define out_rgba(a,b,c,ptr) vec_mstrgb32(typeof(a),a,b,c,((typeof (a))AVV(0)),ptr)
447#define out_argb(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,b,c,ptr)
448#define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
449#define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
450
451DEFCSP420_CVT (yuv2_abgr, out_abgr)
452#if 1
453DEFCSP420_CVT (yuv2_bgra, out_bgra)
454#else
455static int altivec_yuv2_bgra32 (SwsContext *c,
456 unsigned char **in, int *instrides,
457 int srcSliceY, int srcSliceH,
458 unsigned char **oplanes, int *outstrides)
459{
460 int w = c->srcW;
461 int h = srcSliceH;
462 int i,j;
463 int instrides_scl[3];
464 vector unsigned char y0,y1;
465
466 vector signed char u,v;
467
468 vector signed short Y0,Y1,Y2,Y3;
469 vector signed short U,V;
470 vector signed short vx,ux,uvx;
471 vector signed short vx0,ux0,uvx0;
472 vector signed short vx1,ux1,uvx1;
473 vector signed short R0,G0,B0;
474 vector signed short R1,G1,B1;
475 vector unsigned char R,G,B;
476
477 vector unsigned char *uivP, *vivP;
478 vector unsigned char align_perm;
479
480 vector signed short
481 lCY = c->CY,
482 lOY = c->OY,
483 lCRV = c->CRV,
484 lCBU = c->CBU,
485 lCGU = c->CGU,
486 lCGV = c->CGV;
487
488 vector unsigned short lCSHIFT = c->CSHIFT;
489
490 ubyte *y1i = in[0];
491 ubyte *y2i = in[0]+w;
492 ubyte *ui = in[1];
493 ubyte *vi = in[2];
494
495 vector unsigned char *oute
496 = (vector unsigned char *)
497 (oplanes[0]+srcSliceY*outstrides[0]);
498 vector unsigned char *outo
499 = (vector unsigned char *)
500 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
501
502
503 instrides_scl[0] = instrides[0];
504 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */
505 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */
506
507
508 for (i=0;i<h/2;i++) {
509 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
510 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
511
512 for (j=0;j<w/16;j++) {
513
514 y0 = vec_ldl (0,y1i);
515 y1 = vec_ldl (0,y2i);
516 uivP = (vector unsigned char *)ui;
517 vivP = (vector unsigned char *)vi;
518
519 align_perm = vec_lvsl (0, ui);
520 u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
521
522 align_perm = vec_lvsl (0, vi);
523 v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
524 u = (vector signed char)
525 vec_sub (u,(vector signed char)
526 vec_splat((vector signed char)AVV(128),0));
527
528 v = (vector signed char)
529 vec_sub (v, (vector signed char)
530 vec_splat((vector signed char)AVV(128),0));
531
532 U = vec_unpackh (u);
533 V = vec_unpackh (v);
534
535
536 Y0 = vec_unh (y0);
537 Y1 = vec_unl (y0);
538 Y2 = vec_unh (y1);
539 Y3 = vec_unl (y1);
540
541 Y0 = vec_mradds (Y0, lCY, lOY);
542 Y1 = vec_mradds (Y1, lCY, lOY);
543 Y2 = vec_mradds (Y2, lCY, lOY);
544 Y3 = vec_mradds (Y3, lCY, lOY);
545
546 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */
547 ux = vec_sl (U, lCSHIFT);
548 ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));
549 ux0 = vec_mergeh (ux,ux);
550 ux1 = vec_mergel (ux,ux);
551
552 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */
553 vx = vec_sl (V, lCSHIFT);
554 vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));
555 vx0 = vec_mergeh (vx,vx);
556 vx1 = vec_mergel (vx,vx);
557 /* uvx = ((CGU*u) + (CGV*v))>>15 */
558 uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));
559 uvx = vec_mradds (V, lCGV, uvx);
560 uvx0 = vec_mergeh (uvx,uvx);
561 uvx1 = vec_mergel (uvx,uvx);
562 R0 = vec_add (Y0,vx0);
563 G0 = vec_add (Y0,uvx0);
564 B0 = vec_add (Y0,ux0);
565 R1 = vec_add (Y1,vx1);
566 G1 = vec_add (Y1,uvx1);
567 B1 = vec_add (Y1,ux1);
568 R = vec_packclp (R0,R1);
569 G = vec_packclp (G0,G1);
570 B = vec_packclp (B0,B1);
571
572 out_argb(R,G,B,oute);
573 R0 = vec_add (Y2,vx0);
574 G0 = vec_add (Y2,uvx0);
575 B0 = vec_add (Y2,ux0);
576 R1 = vec_add (Y3,vx1);
577 G1 = vec_add (Y3,uvx1);
578 B1 = vec_add (Y3,ux1);
579 R = vec_packclp (R0,R1);
580 G = vec_packclp (G0,G1);
581 B = vec_packclp (B0,B1);
582
583 out_argb(R,G,B,outo);
584 y1i += 16;
585 y2i += 16;
586 ui += 8;
587 vi += 8;
588
589 }
590
591 outo += (outstrides[0])>>4;
592 oute += (outstrides[0])>>4;
593
594 ui += instrides_scl[1];
595 vi += instrides_scl[2];
596 y1i += instrides_scl[0];
597 y2i += instrides_scl[0];
598 }
599 return srcSliceH;
600}
601
602#endif
603
604
605DEFCSP420_CVT (yuv2_rgba, out_rgba)
606DEFCSP420_CVT (yuv2_argb, out_argb)
607DEFCSP420_CVT (yuv2_rgb24, out_rgb24)
608DEFCSP420_CVT (yuv2_bgr24, out_bgr24)
609
610
611// uyvy|uyvy|uyvy|uyvy
612// 0123 4567 89ab cdef
613static
614const vector unsigned char
615 demux_u = AVV(0x10,0x00,0x10,0x00,
616 0x10,0x04,0x10,0x04,
617 0x10,0x08,0x10,0x08,
618 0x10,0x0c,0x10,0x0c),
619 demux_v = AVV(0x10,0x02,0x10,0x02,
620 0x10,0x06,0x10,0x06,
621 0x10,0x0A,0x10,0x0A,
622 0x10,0x0E,0x10,0x0E),
623 demux_y = AVV(0x10,0x01,0x10,0x03,
624 0x10,0x05,0x10,0x07,
625 0x10,0x09,0x10,0x0B,
626 0x10,0x0D,0x10,0x0F);
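In these permute masks, index 0x10 selects from the zero vector, so each sample is widened to a 16-bit lane, and each chroma sample is duplicated across its pixel pair. A scalar sketch (not in the original) of what one 16-byte UYVY group demuxes to:

    /* Scalar model of demux_u/demux_v/demux_y on 16 input bytes
       u0 y0 v0 y1 | u1 y2 v1 y3 | u2 y4 v2 y5 | u3 y6 v3 y7. */
    static void demux_uyvy_model(const unsigned char in[16],
                                 short U[8], short V[8], short Y[8])
    {
        int i;
        for (i = 0; i < 4; i++) {
            U[2*i] = U[2*i+1] = in[4*i + 0]; /* chroma duplicated per pair */
            V[2*i] = V[2*i+1] = in[4*i + 2];
            Y[2*i]            = in[4*i + 1]; /* two lumas per UYVY group   */
            Y[2*i+1]          = in[4*i + 3];
        }
    }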
627
628/*
629 this is so I can play live CCIR raw video
630*/
631static int altivec_uyvy_rgb32 (SwsContext *c,
632 unsigned char **in, int *instrides,
633 int srcSliceY, int srcSliceH,
634 unsigned char **oplanes, int *outstrides)
635{
636 int w = c->srcW;
637 int h = srcSliceH;
638 int i,j;
639 vector unsigned char uyvy;
640 vector signed short Y,U,V;
641 vector signed short R0,G0,B0,R1,G1,B1;
642 vector unsigned char R,G,B;
643 vector unsigned char *out;
644 ubyte *img;
645
646 img = in[0];
647 out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
648
649 for (i=0;i<h;i++) {
650 for (j=0;j<w/16;j++) {
651 uyvy = vec_ld (0, img);
652 U = (vector signed short)
653 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
654
655 V = (vector signed short)
656 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
657
658 Y = (vector signed short)
659 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
660
661 cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
662
663 uyvy = vec_ld (16, img);
664 U = (vector signed short)
665 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
666
667 V = (vector signed short)
668 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
669
670 Y = (vector signed short)
671 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
672
673 cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
674
675 R = vec_packclp (R0,R1);
676 G = vec_packclp (G0,G1);
677 B = vec_packclp (B0,B1);
678
679 // vec_mstbgr24 (R,G,B, out);
680 out_rgba (R,G,B,out);
681
682 img += 32;
683 }
684 }
685 return srcSliceH;
686}
687
688
689
690/* OK, currently the acceleration routine only supports
691 inputs whose widths are a multiple of 16
692 and whose heights are a multiple of 2.
693
694 So we just fall back to the C code for anything else.
695*/
696SwsFunc yuv2rgb_init_altivec (SwsContext *c)
697{
698 if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
699 return NULL;
700
701 /*
702 And this seems not to matter too much: I tried a bunch of
703 videos with abnormal widths and MPlayer crashes elsewhere, e.g.
704 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
705 goes boom with an X11 bad match.
706
707 */
708 if ((c->srcW & 0xf) != 0) return NULL;
709
710 switch (c->srcFormat) {
711 case PIX_FMT_YUV410P:
712 case PIX_FMT_YUV420P:
713 /*case IMGFMT_CLPL: ??? */
714 case PIX_FMT_GRAY8:
715 case PIX_FMT_NV12:
716 case PIX_FMT_NV21:
717 if ((c->srcH & 0x1) != 0)
718 return NULL;
719
720 switch(c->dstFormat){
721 case PIX_FMT_RGB24:
722 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
723 return altivec_yuv2_rgb24;
724 case PIX_FMT_BGR24:
725 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
726 return altivec_yuv2_bgr24;
727 case PIX_FMT_ARGB:
728 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
729 return altivec_yuv2_argb;
730 case PIX_FMT_ABGR:
731 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
732 return altivec_yuv2_abgr;
733 case PIX_FMT_RGBA:
734 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
735 return altivec_yuv2_rgba;
736 case PIX_FMT_BGRA:
737 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
738 return altivec_yuv2_bgra;
739 default: return NULL;
740 }
741 break;
742
743 case PIX_FMT_UYVY422:
744 switch(c->dstFormat){
745 case PIX_FMT_BGR32:
746 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
747 return altivec_uyvy_rgb32;
748 default: return NULL;
749 }
750 break;
751
752 }
753 return NULL;
754}
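A caller would typically probe this initializer and keep its C converter when NULL comes back; a hypothetical selection sketch (the C-fallback name here is illustrative, not an API defined in this file):

    /* Hypothetical converter selection. */
    static SwsFunc pick_yuv2rgb_sketch(SwsContext *c)
    {
        SwsFunc f = yuv2rgb_init_altivec(c);
        return f ? f : yuv2rgb_c_fallback(c); /* assumed C path */
    }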
755
756void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
757{
758 union {
759 signed short tmp[8] __attribute__ ((aligned(16)));
760 vector signed short vec;
761 } buf;
762
763 buf.tmp[0] = ((0xffffLL) * contrast>>8)>>9; //cy
764 buf.tmp[1] = -256*brightness; //oy
765 buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv
766 buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu
767 buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu
768 buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv
769
770
771 c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
772 c->CY = vec_splat ((vector signed short)buf.vec, 0);
773 c->OY = vec_splat ((vector signed short)buf.vec, 1);
774 c->CRV = vec_splat ((vector signed short)buf.vec, 2);
775 c->CBU = vec_splat ((vector signed short)buf.vec, 3);
776 c->CGU = vec_splat ((vector signed short)buf.vec, 4);
777 c->CGV = vec_splat ((vector signed short)buf.vec, 5);
778#if 0
779 {
780 int i;
781 char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
782 for (i=0; i<6; i++)
783 printf("%s %d ", v[i],buf.tmp[i] );
784 printf("\n");
785 }
786#endif
787 return;
788}
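A worked example for the defaults (brightness = 0, contrast = saturation = 1<<16): buf.tmp[0] = ((0xffff * 0x10000) >> 8) >> 9 = 0x7fff = 32767, roughly 1.0 in the Q15 format that vec_mradds consumes, and buf.tmp[1] = 0, so the luma path reduces to a near-identity scale with no offset.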
789
790
791void
792altivec_yuv2packedX (SwsContext *c,
793 int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
794 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
795 uint8_t *dest, int dstW, int dstY)
796{
797 int i,j;
798 vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
799 vector signed short R0,G0,B0,R1,G1,B1;
800
801 vector unsigned char R,G,B;
802 vector unsigned char *out,*nout;
803
804 vector signed short RND = vec_splat_s16(1<<3);
805 vector unsigned short SCL = vec_splat_u16(4);
806 unsigned long scratch[16] __attribute__ ((aligned (16)));
807
808 vector signed short *YCoeffs, *CCoeffs;
809
810 YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
811 CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
812
813 out = (vector unsigned char *)dest;
814
815 for (i=0; i<dstW; i+=16){
816 Y0 = RND;
817 Y1 = RND;
818 /* extract 16 coeffs from lumSrc */
819 for (j=0; j<lumFilterSize; j++) {
820 X0 = vec_ld (0, &lumSrc[j][i]);
821 X1 = vec_ld (16, &lumSrc[j][i]);
822 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
823 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
824 }
825
826 U = RND;
827 V = RND;
828 /* extract 8 coeffs from U,V */
829 for (j=0; j<chrFilterSize; j++) {
830 X = vec_ld (0, &chrSrc[j][i/2]);
831 U = vec_mradds (X, CCoeffs[j], U);
832 X = vec_ld (0, &chrSrc[j][i/2+2048]);
833 V = vec_mradds (X, CCoeffs[j], V);
834 }
835
836 /* scale and clip signals */
837 Y0 = vec_sra (Y0, SCL);
838 Y1 = vec_sra (Y1, SCL);
839 U = vec_sra (U, SCL);
840 V = vec_sra (V, SCL);
841
842 Y0 = vec_clip_s16 (Y0);
843 Y1 = vec_clip_s16 (Y1);
844 U = vec_clip_s16 (U);
845 V = vec_clip_s16 (V);
846
847 /* now we have
848 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
849 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
850
851 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
852 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
853 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
854 */
855
856 U0 = vec_mergeh (U,U);
857 V0 = vec_mergeh (V,V);
858
859 U1 = vec_mergel (U,U);
860 V1 = vec_mergel (V,V);
861
862 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
863 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
864
865 R = vec_packclp (R0,R1);
866 G = vec_packclp (G0,G1);
867 B = vec_packclp (B0,B1);
868
869 switch(c->dstFormat) {
870 case PIX_FMT_ABGR: out_abgr (R,G,B,out); break;
871 case PIX_FMT_BGRA: out_bgra (R,G,B,out); break;
872 case PIX_FMT_RGBA: out_rgba (R,G,B,out); break;
873 case PIX_FMT_ARGB: out_argb (R,G,B,out); break;
874 case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
875 case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
876 default:
877 {
878 /* If this is reached, the caller should have called yuv2packedXinC
879 instead. */
880 static int printed_error_message;
881 if (!printed_error_message) {
882 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
883 sws_format_name(c->dstFormat));
884 printed_error_message=1;
885 }
886 return;
887 }
888 }
889 }
890
891 if (i < dstW) {
892 i -= 16;
893
894 Y0 = RND;
895 Y1 = RND;
896 /* extract 16 coeffs from lumSrc */
897 for (j=0; j<lumFilterSize; j++) {
898 X0 = vec_ld (0, &lumSrc[j][i]);
899 X1 = vec_ld (16, &lumSrc[j][i]);
900 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
901 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
902 }
903
904 U = RND;
905 V = RND;
906 /* extract 8 coeffs from U,V */
907 for (j=0; j<chrFilterSize; j++) {
908 X = vec_ld (0, &chrSrc[j][i/2]);
909 U = vec_mradds (X, CCoeffs[j], U);
910 X = vec_ld (0, &chrSrc[j][i/2+2048]);
911 V = vec_mradds (X, CCoeffs[j], V);
912 }
913
914 /* scale and clip signals */
915 Y0 = vec_sra (Y0, SCL);
916 Y1 = vec_sra (Y1, SCL);
917 U = vec_sra (U, SCL);
918 V = vec_sra (V, SCL);
919
920 Y0 = vec_clip_s16 (Y0);
921 Y1 = vec_clip_s16 (Y1);
922 U = vec_clip_s16 (U);
923 V = vec_clip_s16 (V);
924
925 /* now we have
926 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
927 U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
928
929 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
930 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
931 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
932 */
933
934 U0 = vec_mergeh (U,U);
935 V0 = vec_mergeh (V,V);
936
937 U1 = vec_mergel (U,U);
938 V1 = vec_mergel (V,V);
939
940 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
941 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
942
943 R = vec_packclp (R0,R1);
944 G = vec_packclp (G0,G1);
945 B = vec_packclp (B0,B1);
946
947 nout = (vector unsigned char *)scratch;
948 switch(c->dstFormat) {
949 case PIX_FMT_ABGR: out_abgr (R,G,B,nout); break;
950 case PIX_FMT_BGRA: out_bgra (R,G,B,nout); break;
951 case PIX_FMT_RGBA: out_rgba (R,G,B,nout); break;
952 case PIX_FMT_ARGB: out_argb (R,G,B,nout); break;
953 case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
954 case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
955 default:
956 /* Unreachable, I think. */
957 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
958 sws_format_name(c->dstFormat));
959 return;
960 }
961
962 memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)*4); /* (dstW-i) remaining 32-bit pixels, 4 bytes each */
963 }
964
965}
diff --git a/src/plugins/ffmpeg/libswscale/yuv2rgb_bfin.c b/src/plugins/ffmpeg/libswscale/yuv2rgb_bfin.c
deleted file mode 100644
index 1500a96..0000000
--- a/src/plugins/ffmpeg/libswscale/yuv2rgb_bfin.c
+++ /dev/null
@@ -1,206 +0,0 @@
1/*
2 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
3 *
4 * Blackfin video color space converter operations
5 * converts I420/YV12 to RGB in various formats
6 *
7 * This file is part of FFmpeg.
8 *
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24#include <stdio.h>
25#include <stdlib.h>
26#include <string.h>
27#include <inttypes.h>
28#include <assert.h>
29#include "config.h"
30#ifdef HAVE_MALLOC_H
31#include <malloc.h>
32#endif
33#include <unistd.h>
34#include "rgb2rgb.h"
35#include "swscale.h"
36#include "swscale_internal.h"
37
38#ifdef __FDPIC__
39#define L1CODE __attribute__ ((l1_text))
40#else
41#define L1CODE
42#endif
43
44extern void ff_bfin_yuv2rgb555_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
45 int w, uint32_t *coeffs) L1CODE;
46
47extern void ff_bfin_yuv2rgb565_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
48 int w, uint32_t *coeffs) L1CODE;
49
50extern void ff_bfin_yuv2rgb24_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
51 int w, uint32_t *coeffs) L1CODE;
52
53typedef void (* ltransform_t)(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
54 int w, uint32_t *coeffs);
55
56
57static void bfin_prepare_coefficients (SwsContext *c, int rgb, int masks)
58{
59 int oy;
60 oy = c->yOffset&0xffff;
61 oy = oy >> 3; // keep everything U8.0 for offset calculation
62
63 c->oc = 128*0x01010101U;
64 c->oy = oy*0x01010101U;
65
66 /* copy 64bit vector coeffs down to 32bit vector coeffs */
67 c->cy = c->yCoeff;
68 c->zero = 0;
69
70 if (rgb) {
71 c->crv = c->vrCoeff;
72 c->cbu = c->ubCoeff;
73 c->cgu = c->ugCoeff;
74 c->cgv = c->vgCoeff;
75 } else {
76 c->crv = c->ubCoeff;
77 c->cbu = c->vrCoeff;
78 c->cgu = c->vgCoeff;
79 c->cgv = c->ugCoeff;
80 }
81
82
83 if (masks == 555) {
84 c->rmask = 0x001f * 0x00010001U;
85 c->gmask = 0x03e0 * 0x00010001U;
86 c->bmask = 0x7c00 * 0x00010001U;
87 } else if (masks == 565) {
88 c->rmask = 0x001f * 0x00010001U;
89 c->gmask = 0x07e0 * 0x00010001U;
90 c->bmask = 0xf800 * 0x00010001U;
91 }
92}
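The mask constants rely on a small replication trick: multiplying a 16-bit mask by 0x00010001 copies it into both halves of a 32-bit word, matching the Blackfin dual-16-bit datapath. A one-line worked example:

    unsigned gmask565 = 0x07e0 * 0x00010001U; /* == 0x07e007e0 */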
93
94static int core_yuv420_rgb (SwsContext *c,
95 uint8_t **in, int *instrides,
96 int srcSliceY, int srcSliceH,
97 uint8_t **oplanes, int *outstrides,
98 ltransform_t lcscf, int rgb, int masks)
99{
100 uint8_t *py,*pu,*pv,*op;
101 int w = instrides[0];
102 int h2 = srcSliceH>>1;
103 int i;
104
105 bfin_prepare_coefficients (c, rgb, masks);
106
107 py = in[0];
108 pu = in[1+(1^rgb)];
109 pv = in[1+(0^rgb)];
110
111 op = oplanes[0] + srcSliceY*outstrides[0];
112
113 for (i=0;i<h2;i++) {
114
115 lcscf (py, pu, pv, op, w, &c->oy);
116
117 py += instrides[0];
118 op += outstrides[0];
119
120 lcscf (py, pu, pv, op, w, &c->oy);
121
122 py += instrides[0];
123 pu += instrides[1];
124 pv += instrides[2];
125 op += outstrides[0];
126 }
127
128 return srcSliceH;
129}
130
131
132static int bfin_yuv420_rgb555 (SwsContext *c,
133 uint8_t **in, int *instrides,
134 int srcSliceY, int srcSliceH,
135 uint8_t **oplanes, int *outstrides)
136{
137 return core_yuv420_rgb (c, in, instrides, srcSliceY, srcSliceH, oplanes, outstrides,
138 ff_bfin_yuv2rgb555_line, 1, 555);
139}
140
141static int bfin_yuv420_bgr555 (SwsContext *c,
142 uint8_t **in, int *instrides,
143 int srcSliceY, int srcSliceH,
144 uint8_t **oplanes, int *outstrides)
145{
146 return core_yuv420_rgb (c, in, instrides, srcSliceY, srcSliceH, oplanes, outstrides,
147 ff_bfin_yuv2rgb555_line, 0, 555);
148}
149
150static int bfin_yuv420_rgb24 (SwsContext *c,
151 uint8_t **in, int *instrides,
152 int srcSliceY, int srcSliceH,
153 uint8_t **oplanes, int *outstrides)
154{
155 return core_yuv420_rgb (c, in, instrides, srcSliceY, srcSliceH, oplanes, outstrides,
156 ff_bfin_yuv2rgb24_line, 1, 888);
157}
158
159static int bfin_yuv420_bgr24 (SwsContext *c,
160 uint8_t **in, int *instrides,
161 int srcSliceY, int srcSliceH,
162 uint8_t **oplanes, int *outstrides)
163{
164 return core_yuv420_rgb (c, in, instrides, srcSliceY, srcSliceH, oplanes, outstrides,
165 ff_bfin_yuv2rgb24_line, 0, 888);
166}
167
168static int bfin_yuv420_rgb565 (SwsContext *c,
169 uint8_t **in, int *instrides,
170 int srcSliceY, int srcSliceH,
171 uint8_t **oplanes, int *outstrides)
172{
173 return core_yuv420_rgb (c, in, instrides, srcSliceY, srcSliceH, oplanes, outstrides,
174 ff_bfin_yuv2rgb565_line, 1, 565);
175}
176
177static int bfin_yuv420_bgr565 (SwsContext *c,
178 uint8_t **in, int *instrides,
179 int srcSliceY, int srcSliceH,
180 uint8_t **oplanes, int *outstrides)
181{
182 return core_yuv420_rgb (c, in, instrides, srcSliceY, srcSliceH, oplanes, outstrides,
183 ff_bfin_yuv2rgb565_line, 0, 565);
184}
185
186
187SwsFunc ff_bfin_yuv2rgb_get_func_ptr (SwsContext *c)
188{
189 SwsFunc f;
190
191 switch(c->dstFormat) {
192 case PIX_FMT_RGB555: f = bfin_yuv420_rgb555; break;
193 case PIX_FMT_BGR555: f = bfin_yuv420_bgr555; break;
194 case PIX_FMT_RGB565: f = bfin_yuv420_rgb565; break;
195 case PIX_FMT_BGR565: f = bfin_yuv420_bgr565; break;
196 case PIX_FMT_RGB24: f = bfin_yuv420_rgb24; break;
197 case PIX_FMT_BGR24: f = bfin_yuv420_bgr24; break;
198 default:
199 return 0;
200 }
201
202 av_log(c, AV_LOG_INFO, "Blackfin accelerated color space converter %s\n",
203 sws_format_name (c->dstFormat));
204
205 return f;
206}
diff --git a/src/plugins/ffmpeg/libswscale/yuv2rgb_mlib.c b/src/plugins/ffmpeg/libswscale/yuv2rgb_mlib.c
deleted file mode 100644
index ff2e50a..0000000
--- a/src/plugins/ffmpeg/libswscale/yuv2rgb_mlib.c
+++ /dev/null
@@ -1,85 +0,0 @@
1/*
2 * software YUV to RGB converter using mediaLib
3 *
4 * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23#include <mlib_types.h>
24#include <mlib_status.h>
25#include <mlib_sys.h>
26#include <mlib_video.h>
27#include <inttypes.h>
28#include <stdlib.h>
29#include <assert.h>
30
31#include "swscale.h"
32
33static int mlib_YUV2ARGB420_32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
34 int srcSliceH, uint8_t* dst[], int dstStride[]){
35 if(c->srcFormat == PIX_FMT_YUV422P){
36 srcStride[1] *= 2;
37 srcStride[2] *= 2;
38 }
39
40 assert(srcStride[1] == srcStride[2]);
41
42 mlib_VideoColorYUV2ARGB420(dst[0]+srcSliceY*dstStride[0], src[0], src[1], src[2], c->dstW,
43 srcSliceH, dstStride[0], srcStride[0], srcStride[1]);
44 return srcSliceH;
45}
46
47static int mlib_YUV2ABGR420_32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
48 int srcSliceH, uint8_t* dst[], int dstStride[]){
49 if(c->srcFormat == PIX_FMT_YUV422P){
50 srcStride[1] *= 2;
51 srcStride[2] *= 2;
52 }
53
54 assert(srcStride[1] == srcStride[2]);
55
56 mlib_VideoColorYUV2ABGR420(dst[0]+srcSliceY*dstStride[0], src[0], src[1], src[2], c->dstW,
57 srcSliceH, dstStride[0], srcStride[0], srcStride[1]);
58 return srcSliceH;
59}
60
61static int mlib_YUV2RGB420_24(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
62 int srcSliceH, uint8_t* dst[], int dstStride[]){
63 if(c->srcFormat == PIX_FMT_YUV422P){
64 srcStride[1] *= 2;
65 srcStride[2] *= 2;
66 }
67
68 assert(srcStride[1] == srcStride[2]);
69
70 mlib_VideoColorYUV2RGB420(dst[0]+srcSliceY*dstStride[0], src[0], src[1], src[2], c->dstW,
71 srcSliceH, dstStride[0], srcStride[0], srcStride[1]);
72 return srcSliceH;
73}
74
75
76SwsFunc yuv2rgb_init_mlib(SwsContext *c)
77{
78 switch(c->dstFormat){
79 case PIX_FMT_RGB24: return mlib_YUV2RGB420_24;
80 case PIX_FMT_BGR32: return mlib_YUV2ARGB420_32;
81 case PIX_FMT_RGB32: return mlib_YUV2ABGR420_32;
82 default: return NULL;
83 }
84}
85
diff --git a/src/plugins/ffmpeg/libswscale/yuv2rgb_template.c b/src/plugins/ffmpeg/libswscale/yuv2rgb_template.c
deleted file mode 100644
index 1f8e225..0000000
--- a/src/plugins/ffmpeg/libswscale/yuv2rgb_template.c
+++ /dev/null
@@ -1,538 +0,0 @@
1/*
2 * yuv2rgb_mmx.c, software YUV to RGB converter with Intel MMX "technology"
3 *
4 * Copyright (C) 2000, Silicon Integrated System Corp.
5 *
6 * Author: Olie Lho <ollie@sis.com.tw>
7 *
8 * 15,24 bpp and dithering from Michael Niedermayer (michaelni@gmx.at)
9 * MMX/MMX2 Template stuff from Michael Niedermayer (needed for fast movntq support)
10 * context / deglobalize stuff by Michael Niedermayer
11 *
12 * This file is part of mpeg2dec, a free MPEG-2 video decoder
13 *
14 * mpeg2dec is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2, or (at your option)
17 * any later version.
18 *
19 * mpeg2dec is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License
25 * along with mpeg2dec; if not, write to the Free Software
26 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
27 */
28
29#undef MOVNTQ
30#undef EMMS
31#undef SFENCE
32
33#ifdef HAVE_3DNOW
34/* On K6, femms is faster than emms. On K7, femms is directly mapped to emms. */
35#define EMMS "femms"
36#else
37#define EMMS "emms"
38#endif
39
40#ifdef HAVE_MMX2
41#define MOVNTQ "movntq"
42#define SFENCE "sfence"
43#else
44#define MOVNTQ "movq"
45#define SFENCE "/nop"
46#endif
47
48#define YUV2RGB \
49 /* Do the multiply part of the conversion for even and odd pixels,
50 register usage:
51 mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
52 mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
53 mm6 -> Y even, mm7 -> Y odd */\
54 /* convert the chroma part */\
55 "punpcklbw %%mm4, %%mm0;" /* scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \
56 "punpcklbw %%mm4, %%mm1;" /* scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \
57\
58 "psllw $3, %%mm0;" /* Promote precision */ \
59 "psllw $3, %%mm1;" /* Promote precision */ \
60\
61 "psubsw "U_OFFSET"(%4), %%mm0;" /* Cb -= 128 */ \
62 "psubsw "V_OFFSET"(%4), %%mm1;" /* Cr -= 128 */ \
63\
64 "movq %%mm0, %%mm2;" /* Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \
65 "movq %%mm1, %%mm3;" /* Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \
66\
67 "pmulhw "UG_COEFF"(%4), %%mm2;" /* Mul Cb with green coeff -> Cb green */ \
68 "pmulhw "VG_COEFF"(%4), %%mm3;" /* Mul Cr with green coeff -> Cr green */ \
69\
70 "pmulhw "UB_COEFF"(%4), %%mm0;" /* Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 */\
71 "pmulhw "VR_COEFF"(%4), %%mm1;" /* Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 */\
72\
73 "paddsw %%mm3, %%mm2;" /* Cb green + Cr green -> Cgreen */\
74\
75 /* convert the luma part */\
76 "movq %%mm6, %%mm7;" /* Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */\
77 "pand "MANGLE(mmx_00ffw)", %%mm6;" /* get Y even 00 Y6 00 Y4 00 Y2 00 Y0 */\
78\
79 "psrlw $8, %%mm7;" /* get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 */\
80\
81 "psllw $3, %%mm6;" /* Promote precision */\
82 "psllw $3, %%mm7;" /* Promote precision */\
83\
84 "psubw "Y_OFFSET"(%4), %%mm6;" /* Y -= 16 */\
85 "psubw "Y_OFFSET"(%4), %%mm7;" /* Y -= 16 */\
86\
87 "pmulhw "Y_COEFF"(%4), %%mm6;" /* Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 */\
88 "pmulhw "Y_COEFF"(%4), %%mm7;" /* Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 */\
89\
90 /* Do the addition part of the conversion for even and odd pixels,
91 register usage:
92 mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
93 mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
94 mm6 -> Y even, mm7 -> Y odd */\
95 "movq %%mm0, %%mm3;" /* Copy Cblue */\
96 "movq %%mm1, %%mm4;" /* Copy Cred */\
97 "movq %%mm2, %%mm5;" /* Copy Cgreen */\
98\
99 "paddsw %%mm6, %%mm0;" /* Y even + Cblue 00 B6 00 B4 00 B2 00 B0 */\
100 "paddsw %%mm7, %%mm3;" /* Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 */\
101\
102 "paddsw %%mm6, %%mm1;" /* Y even + Cred 00 R6 00 R4 00 R2 00 R0 */\
103 "paddsw %%mm7, %%mm4;" /* Y odd + Cred 00 R7 00 R5 00 R3 00 R1 */\
104\
105 "paddsw %%mm6, %%mm2;" /* Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 */\
106 "paddsw %%mm7, %%mm5;" /* Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 */\
107\
108 /* Limit RGB even to 0..255 */\
109 "packuswb %%mm0, %%mm0;" /* B6 B4 B2 B0 B6 B4 B2 B0 */\
110 "packuswb %%mm1, %%mm1;" /* R6 R4 R2 R0 R6 R4 R2 R0 */\
111 "packuswb %%mm2, %%mm2;" /* G6 G4 G2 G0 G6 G4 G2 G0 */\
112\
113 /* Limit RGB odd to 0..255 */\
114 "packuswb %%mm3, %%mm3;" /* B7 B5 B3 B1 B7 B5 B3 B1 */\
115 "packuswb %%mm4, %%mm4;" /* R7 R5 R3 R1 R7 R5 R3 R1 */\
116 "packuswb %%mm5, %%mm5;" /* G7 G5 G3 G1 G7 G5 G3 G1 */\
117\
118 /* Interleave RGB even and odd */\
119 "punpcklbw %%mm3, %%mm0;" /* B7 B6 B5 B4 B3 B2 B1 B0 */\
120 "punpcklbw %%mm4, %%mm1;" /* R7 R6 R5 R4 R3 R2 R1 R0 */\
121 "punpcklbw %%mm5, %%mm2;" /* G7 G6 G5 G4 G3 G2 G1 G0 */\
122
123
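The "Promote precision" shifts and pmulhw instructions above implement 16-bit fixed point: pmulhw keeps the high 16 bits of the 16x16 product, i.e. (a*b) >> 16. One luma lane behaves like this scalar model (an illustration, not code from the file):

    /* Scalar model of one luma lane: psllw $3, psubw, pmulhw. */
    static short mmx_luma_model(short y8, short y_offset, short y_coeff)
    {
        short y = (short)(y8 << 3);          /* promote precision        */
        y = (short)(y - y_offset);           /* subtract the Y offset    */
        return (short)((y * y_coeff) >> 16); /* high half of the product */
    }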
124static inline int RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
125 int srcSliceH, uint8_t* dst[], int dstStride[]){
126 int y, h_size;
127
128 if(c->srcFormat == PIX_FMT_YUV422P){
129 srcStride[1] *= 2;
130 srcStride[2] *= 2;
131 }
132
133 h_size= (c->dstW+7)&~7;
134 if(h_size*2 > FFABS(dstStride[0])) h_size-=8;
135
136 asm volatile ("pxor %mm4, %mm4;" /* zero mm4 */ );
137 //printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0],
138 //srcStride[0],srcStride[1],srcStride[2],dstStride[0]);
139 for (y= 0; y<srcSliceH; y++ ) {
140 uint8_t *image = dst[0] + (y+srcSliceY)*dstStride[0];
141 uint8_t *py = src[0] + y*srcStride[0];
142 uint8_t *pu = src[1] + (y>>1)*srcStride[1];
143 uint8_t *pv = src[2] + (y>>1)*srcStride[2];
144 long index= -h_size/2;
145
146 b5Dither= ff_dither8[y&1];
147 g6Dither= ff_dither4[y&1];
148 g5Dither= ff_dither8[y&1];
149 r5Dither= ff_dither8[(y+1)&1];
150 /* This MMX assembly code deals with a SINGLE scan line at a time,
151 * it converts 8 pixels in each iteration. */
152 asm volatile (
153 /* load data for start of next scan line */
154 "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
155 "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
156 "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
157 //".balign 16 \n\t"
158 "1: \n\t"
159 /* No speed difference on my P3@500 with prefetch;
160 * if it is faster for anyone with -benchmark, then tell me.
161 PREFETCH" 64(%0) \n\t"
162 PREFETCH" 64(%1) \n\t"
163 PREFETCH" 64(%2) \n\t"
164 */
165YUV2RGB
166
167#ifdef DITHER1XBPP
168 "paddusb "MANGLE(b5Dither)", %%mm0;"
169 "paddusb "MANGLE(g6Dither)", %%mm2;"
170 "paddusb "MANGLE(r5Dither)", %%mm1;"
171#endif
172 /* mask unneeded bits off */
173 "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
174 "pand "MANGLE(mmx_grnmask)", %%mm2;" /* g7g6g5g4 g3g2_0_0 g7g6g5g4 g3g2_0_0 */
175 "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */
176
177 "psrlw $3, %%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
178 "pxor %%mm4, %%mm4;" /* zero mm4 */
179
180 "movq %%mm0, %%mm5;" /* Copy B7-B0 */
181 "movq %%mm2, %%mm7;" /* Copy G7-G0 */
182
183 /* convert RGB24 plane to RGB16 pack for pixel 0-3 */
184 "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
185 "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
186
187 "psllw $3, %%mm2;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
188 "por %%mm2, %%mm0;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
189
190 "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
191 MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */
192
193 /* convert RGB24 plane to RGB16 pack for pixel 4-7 */
194 "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
195 "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
196
197 "psllw $3, %%mm7;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
198 "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
199
200 "por %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
201 "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
202
203 MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */
204
205 "add $16, %1 \n\t"
206 "add $4, %0 \n\t"
207 " js 1b \n\t"
208
209 : "+r" (index), "+r" (image)
210 : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), "r" (py - 2*index)
211 );
212 }
213
214 asm volatile (EMMS);
215
216 return srcSliceH;
217}
218
219static inline int RENAME(yuv420_rgb15)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
220 int srcSliceH, uint8_t* dst[], int dstStride[]){
221 int y, h_size;
222
223 if(c->srcFormat == PIX_FMT_YUV422P){
224 srcStride[1] *= 2;
225 srcStride[2] *= 2;
226 }
227
228 h_size= (c->dstW+7)&~7;
229 if(h_size*2 > FFABS(dstStride[0])) h_size-=8;
230
231 asm volatile ("pxor %mm4, %mm4;" /* zero mm4 */ );
232 //printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0],
233 //srcStride[0],srcStride[1],srcStride[2],dstStride[0]);
234 for (y= 0; y<srcSliceH; y++ ) {
235 uint8_t *image = dst[0] + (y+srcSliceY)*dstStride[0];
236 uint8_t *py = src[0] + y*srcStride[0];
237 uint8_t *pu = src[1] + (y>>1)*srcStride[1];
238 uint8_t *pv = src[2] + (y>>1)*srcStride[2];
239 long index= -h_size/2;
240
241 b5Dither= ff_dither8[y&1];
242 g6Dither= ff_dither4[y&1];
243 g5Dither= ff_dither8[y&1];
244 r5Dither= ff_dither8[(y+1)&1];
245 /* This MMX assembly code deals with a SINGLE scan line at a time,
246 * it converts 8 pixels in each iteration. */
247 asm volatile (
248 /* load data for start of next scan line */
249 "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
250 "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
251 "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
252 //".balign 16 \n\t"
253 "1: \n\t"
254YUV2RGB
255
256#ifdef DITHER1XBPP
257 "paddusb "MANGLE(b5Dither)", %%mm0 \n\t"
258 "paddusb "MANGLE(g5Dither)", %%mm2 \n\t"
259 "paddusb "MANGLE(r5Dither)", %%mm1 \n\t"
260#endif
261
262 /* mask unneeded bits off */
263 "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
264 "pand "MANGLE(mmx_redmask)", %%mm2;" /* g7g6g5g4 g3_0_0_0 g7g6g5g4 g3_0_0_0 */
265 "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */
266
267 "psrlw $3, %%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
268 "psrlw $1, %%mm1;" /* 0_r7r6r5 r4r3_0_0 0_r7r6r5 r4r3_0_0 */
269 "pxor %%mm4, %%mm4;" /* zero mm4 */
270
271 "movq %%mm0, %%mm5;" /* Copy B7-B0 */
272 "movq %%mm2, %%mm7;" /* Copy G7-G0 */
273
274 /* convert RGB24 plane to RGB15 pack for pixel 0-3 */
275 "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */
276 "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
277
278 "psllw $2, %%mm2;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
279 "por %%mm2, %%mm0;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */
280
281 "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
282 MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */
283
284 /* convert RGB24 plane to RGB15 pack for pixel 4-7 */
285 "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 0_g7g6g5 g4g3_0_0 */
286 "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
287
288 "psllw $2, %%mm7;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
289 "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
290
291 "por %%mm7, %%mm5;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */
292 "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
293
294 MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */
295
296 "add $16, %1 \n\t"
297 "add $4, %0 \n\t"
298 " js 1b \n\t"
299 : "+r" (index), "+r" (image)
300 : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), "r" (py - 2*index)
301 );
302 }
303
304 asm volatile (EMMS);
305 return srcSliceH;
306}
307
308static inline int RENAME(yuv420_rgb24)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
309 int srcSliceH, uint8_t* dst[], int dstStride[]){
310 int y, h_size;
311
312 if(c->srcFormat == PIX_FMT_YUV422P){
313 srcStride[1] *= 2;
314 srcStride[2] *= 2;
315 }
316
317 h_size= (c->dstW+7)&~7;
318 if(h_size*3 > FFABS(dstStride[0])) h_size-=8;
319
320 asm volatile ("pxor %mm4, %mm4;" /* zero mm4 */ );
321
322 for (y= 0; y<srcSliceH; y++ ) {
323 uint8_t *image = dst[0] + (y+srcSliceY)*dstStride[0];
324 uint8_t *py = src[0] + y*srcStride[0];
325 uint8_t *pu = src[1] + (y>>1)*srcStride[1];
326 uint8_t *pv = src[2] + (y>>1)*srcStride[2];
327 long index= -h_size/2;
328
329 /* This MMX assembly code deals with a SINGLE scan line at a time,
330 * it converts 8 pixels in each iteration. */
331 asm volatile (
332 /* load data for start of next scan line */
333 "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
334 "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
335 "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
336 //".balign 16 \n\t"
337 "1: \n\t"
338YUV2RGB
339 /* mm0=B, %%mm2=G, %%mm1=R */
340#ifdef HAVE_MMX2
341 "movq "MANGLE(ff_M24A)", %%mm4 \n\t"
342 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"
343 "pshufw $0x50, %%mm0, %%mm5 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */
344 "pshufw $0x50, %%mm2, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */
345 "pshufw $0x00, %%mm1, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */
346
347 "pand %%mm4, %%mm5 \n\t" /* B2 B1 B0 */
348 "pand %%mm4, %%mm3 \n\t" /* G2 G1 G0 */
349 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */
350
351 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */
352 "por %%mm5, %%mm6 \n\t"
353 "por %%mm3, %%mm6 \n\t"
354 MOVNTQ" %%mm6, (%1) \n\t"
355
356 "psrlq $8, %%mm2 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */
357 "pshufw $0xA5, %%mm0, %%mm5 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */
358 "pshufw $0x55, %%mm2, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */
359 "pshufw $0xA5, %%mm1, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */
360
361 "pand "MANGLE(ff_M24B)", %%mm5 \n\t" /* B5 B4 B3 */
362 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */
363 "pand %%mm4, %%mm6 \n\t" /* R4 R3 R2 */
364
365 "por %%mm5, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */
366 "por %%mm3, %%mm6 \n\t"
367 MOVNTQ" %%mm6, 8(%1) \n\t"
368
369 "pshufw $0xFF, %%mm0, %%mm5 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */
370 "pshufw $0xFA, %%mm2, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */
371 "pshufw $0xFA, %%mm1, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */
372 "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
373
374 "pand %%mm7, %%mm5 \n\t" /* B7 B6 */
375 "pand %%mm4, %%mm3 \n\t" /* G7 G6 G5 */
376 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */
377 "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
378\
379 "por %%mm5, %%mm3 \n\t"
380 "por %%mm3, %%mm6 \n\t"
381 MOVNTQ" %%mm6, 16(%1) \n\t"
382 "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
383 "pxor %%mm4, %%mm4 \n\t"
384
385#else
386
387 "pxor %%mm4, %%mm4 \n\t"
388 "movq %%mm0, %%mm5 \n\t" /* B */
389 "movq %%mm1, %%mm6 \n\t" /* R */
390 "punpcklbw %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */
391 "punpcklbw %%mm4, %%mm1 \n\t" /* 0R0R0R0R 0 */
392 "punpckhbw %%mm2, %%mm5 \n\t" /* GBGBGBGB 2 */
393 "punpckhbw %%mm4, %%mm6 \n\t" /* 0R0R0R0R 2 */
394 "movq %%mm0, %%mm7 \n\t" /* GBGBGBGB 0 */
395 "movq %%mm5, %%mm3 \n\t" /* GBGBGBGB 2 */
396 "punpcklwd %%mm1, %%mm7 \n\t" /* 0RGB0RGB 0 */
397 "punpckhwd %%mm1, %%mm0 \n\t" /* 0RGB0RGB 1 */
398 "punpcklwd %%mm6, %%mm5 \n\t" /* 0RGB0RGB 2 */
399 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */
400
401 "movq %%mm7, %%mm2 \n\t" /* 0RGB0RGB 0 */
402 "movq %%mm0, %%mm6 \n\t" /* 0RGB0RGB 1 */
403 "movq %%mm5, %%mm1 \n\t" /* 0RGB0RGB 2 */
404 "movq %%mm3, %%mm4 \n\t" /* 0RGB0RGB 3 */
405
406 "psllq $40, %%mm7 \n\t" /* RGB00000 0 */
407 "psllq $40, %%mm0 \n\t" /* RGB00000 1 */
408 "psllq $40, %%mm5 \n\t" /* RGB00000 2 */
409 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */
410
411 "punpckhdq %%mm2, %%mm7 \n\t" /* 0RGBRGB0 0 */
412 "punpckhdq %%mm6, %%mm0 \n\t" /* 0RGBRGB0 1 */
413 "punpckhdq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */
414 "punpckhdq %%mm4, %%mm3 \n\t" /* 0RGBRGB0 3 */
415
416 "psrlq $8, %%mm7 \n\t" /* 00RGBRGB 0 */
417 "movq %%mm0, %%mm6 \n\t" /* 0RGBRGB0 1 */
418 "psllq $40, %%mm0 \n\t" /* GB000000 1 */
419 "por %%mm0, %%mm7 \n\t" /* GBRGBRGB 0 */
420 MOVNTQ" %%mm7, (%1) \n\t"
421
422 "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
423
424 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */
425 "movq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */
426 "psllq $24, %%mm5 \n\t" /* BRGB0000 2 */
427 "por %%mm5, %%mm6 \n\t" /* BRGBRGBR 1 */
428 MOVNTQ" %%mm6, 8(%1) \n\t"
429
430 "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
431
432 "psrlq $40, %%mm1 \n\t" /* 000000RG 2 */
433 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */
434 "por %%mm3, %%mm1 \n\t" /* RGBRGBRG 2 */
435 MOVNTQ" %%mm1, 16(%1) \n\t"
436
437 "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
438 "pxor %%mm4, %%mm4 \n\t"
439#endif
440
441 "add $24, %1 \n\t"
442 "add $4, %0 \n\t"
443 " js 1b \n\t"
444
445 : "+r" (index), "+r" (image)
446 : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), "r" (py - 2*index)
447 );
448 }
449
450 asm volatile (EMMS);
451 return srcSliceH;
452}
453
454static inline int RENAME(yuv420_rgb32)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
455 int srcSliceH, uint8_t* dst[], int dstStride[]){
456 int y, h_size;
457
458 if(c->srcFormat == PIX_FMT_YUV422P){
459 srcStride[1] *= 2;
460 srcStride[2] *= 2;
461 }
462
463 h_size= (c->dstW+7)&~7;
464 if(h_size*4 > FFABS(dstStride[0])) h_size-=8;
465
466 asm volatile ("pxor %mm4, %mm4;" /* zero mm4 */ );
467
468 for (y= 0; y<srcSliceH; y++ ) {
469 uint8_t *image = dst[0] + (y+srcSliceY)*dstStride[0];
470 uint8_t *py = src[0] + y*srcStride[0];
471 uint8_t *pu = src[1] + (y>>1)*srcStride[1];
472 uint8_t *pv = src[2] + (y>>1)*srcStride[2];
473 long index= -h_size/2;
474
475 /* This MMX assembly code deals with a SINGLE scan line at a time,
476 * it converts 8 pixels in each iteration. */
477 asm volatile (
478 /* load data for start of next scan line */
479 "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
480 "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
481 "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
482 //".balign 16 \n\t"
483 "1: \n\t"
484YUV2RGB
485 /* convert RGB plane to RGB packed format,
486 mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0,
487 mm4 -> GB, mm5 -> AR pixel 4-7,
488 mm6 -> GB, mm7 -> AR pixel 0-3 */
489 "pxor %%mm3, %%mm3;" /* zero mm3 */
490
491 "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
492 "movq %%mm1, %%mm7;" /* R7 R6 R5 R4 R3 R2 R1 R0 */
493
494 "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
495 "movq %%mm1, %%mm5;" /* R7 R6 R5 R4 R3 R2 R1 R0 */
496
497 "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */
498 "punpcklbw %%mm3, %%mm7;" /* 00 R3 00 R2 00 R1 00 R0 */
499
500 "punpcklwd %%mm7, %%mm6;" /* 00 R1 B1 G1 00 R0 B0 G0 */
501 MOVNTQ " %%mm6, (%1);" /* Store ARGB1 ARGB0 */
502
503 "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
504 "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */
505
506 "punpckhwd %%mm7, %%mm6;" /* 00 R3 G3 B3 00 R2 B3 G2 */
507 MOVNTQ " %%mm6, 8 (%1);" /* Store ARGB3 ARGB2 */
508
509 "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */
510 "punpckhbw %%mm3, %%mm5;" /* 00 R7 00 R6 00 R5 00 R4 */
511
512 "punpcklwd %%mm5, %%mm4;" /* 00 R5 B5 G5 00 R4 B4 G4 */
513 MOVNTQ " %%mm4, 16 (%1);" /* Store ARGB5 ARGB4 */
514
515 "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */
516 "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */
517
518 "punpckhwd %%mm5, %%mm4;" /* 00 R7 G7 B7 00 R6 B6 G6 */
519 MOVNTQ " %%mm4, 24 (%1);" /* Store ARGB7 ARGB6 */
520
521 "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
522 "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
523
524 "pxor %%mm4, %%mm4;" /* zero mm4 */
525 "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
526
527 "add $32, %1 \n\t"
528 "add $4, %0 \n\t"
529 " js 1b \n\t"
530
531 : "+r" (index), "+r" (image)
532 : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), "r" (py - 2*index)
533 );
534 }
535
536 asm volatile (EMMS);
537 return srcSliceH;
538}
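
For orientation: the loop above counts a negative index up to zero (add $4, %0; js 1b), which is why the luma and chroma base pointers are pre-biased by -index in the input operands, and h_size is dstW rounded up to the 8-pixel loop granularity, backed off by one group of 8 if that would overrun dstStride[0]. A hypothetical scalar reference for the per-pixel math, using the standard 8-bit fixed-point BT.601 constants instead of the SwsContext coefficient tables the real code loads:

    #include <stdint.h>

    static uint8_t clip8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    /* n RGB32 pixels from n luma and n/2 shared chroma samples; memory
     * byte order B,G,R,0 as in the MOVNTQ stores above. */
    static void yuv2rgb32_ref(const uint8_t *py, const uint8_t *pu,
                              const uint8_t *pv, uint8_t *dst, int n)
    {
        int i;
        for (i = 0; i < n; i++) {
            int y = 298 * (py[i] - 16);
            int u = pu[i >> 1] - 128;
            int v = pv[i >> 1] - 128;
            dst[4*i + 0] = clip8((y + 516 * u           + 128) >> 8); /* B */
            dst[4*i + 1] = clip8((y - 100 * u - 208 * v + 128) >> 8); /* G */
            dst[4*i + 2] = clip8((y           + 409 * v + 128) >> 8); /* R */
            dst[4*i + 3] = 0;
        }
    }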
diff --git a/src/plugins/ffmpeg/libswscale/yuv2rgb_vis.c b/src/plugins/ffmpeg/libswscale/yuv2rgb_vis.c
deleted file mode 100644
index 120fa56..0000000
--- a/src/plugins/ffmpeg/libswscale/yuv2rgb_vis.c
+++ /dev/null
@@ -1,207 +0,0 @@
1/*
2 * VIS optimized software YUV to RGB converter
3 * Copyright (c) 2007 Denes Balatoni <dbalatoni@programozo.hu>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22#include <inttypes.h>
23#include <stdlib.h>
24
25#include "swscale.h"
26#include "swscale_internal.h"
27
28#define YUV2RGB_INIT \
29 "wr %%g0, 0x10, %%gsr \n\t" \
30 "ldd [%5], %%f32 \n\t" \
31 "ldd [%5+8], %%f34 \n\t" \
32 "ldd [%5+16], %%f36 \n\t" \
33 "ldd [%5+24], %%f38 \n\t" \
34 "ldd [%5+32], %%f40 \n\t" \
35 "ldd [%5+40], %%f42 \n\t" \
36 "ldd [%5+48], %%f44 \n\t" \
37 "ldd [%5+56], %%f46 \n\t" \
38 "ldd [%5+64], %%f48 \n\t" \
39 "ldd [%5+72], %%f50 \n\t"
40
41#define YUV2RGB_KERNEL \
42 /* ^^^^ f0=Y f3=u f5=v */ \
43 "fmul8x16 %%f3, %%f48, %%f6 \n\t" \
44 "fmul8x16 %%f19, %%f48, %%f22 \n\t" \
45 "fmul8x16 %%f5, %%f44, %%f8 \n\t" \
46 "fmul8x16 %%f21, %%f44, %%f24 \n\t" \
47 "fmul8x16 %%f0, %%f42, %%f0 \n\t" \
48 "fmul8x16 %%f16, %%f42, %%f16 \n\t" \
49 "fmul8x16 %%f3, %%f50, %%f2 \n\t" \
50 "fmul8x16 %%f19, %%f50, %%f18 \n\t" \
51 "fmul8x16 %%f5, %%f46, %%f4 \n\t" \
52 "fmul8x16 %%f21, %%f46, %%f20 \n\t" \
53 \
54 "fpsub16 %%f6, %%f34, %%f6 \n\t" /* 1 */ \
55 "fpsub16 %%f22, %%f34, %%f22 \n\t" /* 1 */ \
56 "fpsub16 %%f8, %%f38, %%f8 \n\t" /* 3 */ \
57 "fpsub16 %%f24, %%f38, %%f24 \n\t" /* 3 */ \
58 "fpsub16 %%f0, %%f32, %%f0 \n\t" /* 0 */ \
59 "fpsub16 %%f16, %%f32, %%f16 \n\t" /* 0 */ \
60 "fpsub16 %%f2, %%f36, %%f2 \n\t" /* 2 */ \
61 "fpsub16 %%f18, %%f36, %%f18 \n\t" /* 2 */ \
62 "fpsub16 %%f4, %%f40, %%f4 \n\t" /* 4 */ \
63 "fpsub16 %%f20, %%f40, %%f20 \n\t" /* 4 */ \
64 \
65 "fpadd16 %%f0, %%f8, %%f8 \n\t" /* Gt */ \
66 "fpadd16 %%f16, %%f24, %%f24 \n\t" /* Gt */ \
67 "fpadd16 %%f0, %%f4, %%f4 \n\t" /* R */ \
68 "fpadd16 %%f16, %%f20, %%f20 \n\t" /* R */ \
69 "fpadd16 %%f0, %%f6, %%f6 \n\t" /* B */ \
70 "fpadd16 %%f16, %%f22, %%f22 \n\t" /* B */ \
71 "fpadd16 %%f8, %%f2, %%f2 \n\t" /* G */ \
72 "fpadd16 %%f24, %%f18, %%f18 \n\t" /* G */ \
73 \
74 "fpack16 %%f4, %%f4 \n\t" \
75 "fpack16 %%f20, %%f20 \n\t" \
76 "fpack16 %%f6, %%f6 \n\t" \
77 "fpack16 %%f22, %%f22 \n\t" \
78 "fpack16 %%f2, %%f2 \n\t" \
79 "fpack16 %%f18, %%f18 \n\t"
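
Per 16-bit lane, the kernel above evaluates R = cy*(Y - yOff) + cvr*(V - vOff), with analogous two- and one-chroma-term sums for G and B; the constant products cy*yOff etc. are the values preloaded into %f32-%f40 by YUV2RGB_INIT, and fpack16 saturates the results to 8 bits under the GSR scale written in the init block. A loose scalar model (hypothetical helper; fmul8x16 rounding and the exact fpack16 scaling are glossed over with a plain >> 8):

    #include <stdint.h>

    static uint8_t sat8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    /* One lane, scalar-modelled; cy..cvr and the offsets correspond to
     * c->yCoeff etc. and to the products folded into sparc_coeffs[0..4]. */
    static void vis_lane_model(uint8_t Y, uint8_t U, uint8_t V,
                               int cy, int cub, int cug, int cvg, int cvr,
                               int yoff, int uoff, int voff,
                               uint8_t *r, uint8_t *g, uint8_t *b)
    {
        int ly = cy * Y - cy * yoff;                  /* fmul8x16 + fpsub16 */
        *r = sat8((ly + cvr * V - cvr * voff) >> 8);  /* fpadd16 + fpack16  */
        *g = sat8((ly + cug * U - cug * uoff + cvg * V - cvg * voff) >> 8);
        *b = sat8((ly + cub * U - cub * uoff) >> 8);
    }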
80
81
82
83static int vis_420P_ARGB32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
84 int srcSliceH, uint8_t* dst[], int dstStride[]){
85 int y, out1, out2, out3, out4, out5, out6;
86
87 for(y=0;y < srcSliceH;++y) {
88 asm volatile (
89 YUV2RGB_INIT
90 "wr %%g0, 0xd2, %%asi \n\t" /* ASI_FL16_P */
91 "1: \n\t"
92 "ldda [%1] %%asi, %%f2 \n\t"
93 "ldda [%1+2] %%asi, %%f18 \n\t"
94 "ldda [%2] %%asi, %%f4 \n\t"
95 "ldda [%2+2] %%asi, %%f20 \n\t"
96 "ld [%0], %%f0 \n\t"
97 "ld [%0+4], %%f16 \n\t"
98 "fpmerge %%f3, %%f3, %%f2 \n\t"
99 "fpmerge %%f19, %%f19, %%f18 \n\t"
100 "fpmerge %%f5, %%f5, %%f4 \n\t"
101 "fpmerge %%f21, %%f21, %%f20 \n\t"
102 YUV2RGB_KERNEL
103 "fzero %%f0 \n\t"
104 "fpmerge %%f4, %%f6, %%f8 \n\t" // r,b,t1
105 "fpmerge %%f20, %%f22, %%f24 \n\t" // r,b,t1
106 "fpmerge %%f0, %%f2, %%f10 \n\t" // 0,g,t2
107 "fpmerge %%f0, %%f18, %%f26 \n\t" // 0,g,t2
108 "fpmerge %%f10, %%f8, %%f4 \n\t" // t2,t1,msb
109 "fpmerge %%f26, %%f24, %%f20 \n\t" // t2,t1,msb
110 "fpmerge %%f11, %%f9, %%f6 \n\t" // t2,t1,lsb
111 "fpmerge %%f27, %%f25, %%f22 \n\t" // t2,t1,lsb
112 "std %%f4, [%3] \n\t"
113 "std %%f20, [%3+16] \n\t"
114 "std %%f6, [%3+8] \n\t"
115 "std %%f22, [%3+24] \n\t"
116
117 "add %0, 8, %0 \n\t"
118 "add %1, 4, %1 \n\t"
119 "add %2, 4, %2 \n\t"
120 "subcc %4, 8, %4 \n\t"
121 "bne 1b \n\t"
122 "add %3, 32, %3 \n\t" //delay slot
123 : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6)
124 : "0" (src[0]+(y+srcSliceY)*srcStride[0]), "1" (src[1]+((y+srcSliceY)>>1)*srcStride[1]),
125 "2" (src[2]+((y+srcSliceY)>>1)*srcStride[2]), "3" (dst[0]+(y+srcSliceY)*dstStride[0]),
126 "4" (c->dstW),
127 "5" (c->sparc_coeffs)
128 );
129 }
130
131 return srcSliceH;
132}
133
134static int vis_422P_ARGB32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
135 int srcSliceH, uint8_t* dst[], int dstStride[]){
136 int y, out1, out2, out3, out4, out5, out6;
137
138 for(y=0;y < srcSliceH;++y) {
139 asm volatile (
140 YUV2RGB_INIT
141 "wr %%g0, 0xd2, %%asi \n\t" /* ASI_FL16_P */
142 "1: \n\t"
143 "ldda [%1] %%asi, %%f2 \n\t"
144 "ldda [%1+2] %%asi, %%f18 \n\t"
145 "ldda [%2] %%asi, %%f4 \n\t"
146 "ldda [%2+2] %%asi, %%f20 \n\t"
147 "ld [%0], %%f0 \n\t"
148 "ld [%0+4], %%f16 \n\t"
149 "fpmerge %%f3, %%f3, %%f2 \n\t"
150 "fpmerge %%f19, %%f19, %%f18 \n\t"
151 "fpmerge %%f5, %%f5, %%f4 \n\t"
152 "fpmerge %%f21, %%f21, %%f20 \n\t"
153 YUV2RGB_KERNEL
154 "fzero %%f0 \n\t"
155 "fpmerge %%f4, %%f6, %%f8 \n\t" // r,b,t1
156 "fpmerge %%f20, %%f22, %%f24 \n\t" // r,b,t1
157 "fpmerge %%f0, %%f2, %%f10 \n\t" // 0,g,t2
158 "fpmerge %%f0, %%f18, %%f26 \n\t" // 0,g,t2
159 "fpmerge %%f10, %%f8, %%f4 \n\t" // t2,t1,msb
160 "fpmerge %%f26, %%f24, %%f20 \n\t" // t2,t1,msb
161 "fpmerge %%f11, %%f9, %%f6 \n\t" // t2,t1,lsb
162 "fpmerge %%f27, %%f25, %%f22 \n\t" // t2,t1,lsb
163 "std %%f4, [%3] \n\t"
164 "std %%f20, [%3+16] \n\t"
165 "std %%f6, [%3+8] \n\t"
166 "std %%f22, [%3+24] \n\t"
167
168 "add %0, 8, %0 \n\t"
169 "add %1, 4, %1 \n\t"
170 "add %2, 4, %2 \n\t"
171 "subcc %4, 8, %4 \n\t"
172 "bne 1b \n\t"
173 "add %3, 32, %3 \n\t" //delay slot
174 : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6)
175 : "0" (src[0]+(y+srcSliceY)*srcStride[0]), "1" (src[1]+(y+srcSliceY)*srcStride[1]),
176 "2" (src[2]+(y+srcSliceY)*srcStride[2]), "3" (dst[0]+(y+srcSliceY)*dstStride[0]),
177 "4" (c->dstW),
178 "5" (c->sparc_coeffs)
179 );
180 }
181
182 return srcSliceH;
183}
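
The two converters above are identical except for the chroma row addressing in their input operands: 4:2:0 shares one chroma row between two luma rows (hence the >>1 on the row index), while 4:2:2 carries chroma on every row. A sketch of that difference (hypothetical helper, same variable meanings as above):

    #include <stdint.h>

    /* Select the chroma row for absolute output row `row' (= y + srcSliceY);
     * v_sub is 1 for 4:2:0, 0 for 4:2:2. */
    static const uint8_t *chroma_row(const uint8_t *plane, int stride,
                                     int row, int v_sub)
    {
        return plane + (row >> v_sub) * stride;
    }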
184
185SwsFunc yuv2rgb_init_vis(SwsContext *c) {
186 c->sparc_coeffs[5]=c->yCoeff;
187 c->sparc_coeffs[6]=c->vgCoeff;
188 c->sparc_coeffs[7]=c->vrCoeff;
189 c->sparc_coeffs[8]=c->ubCoeff;
190 c->sparc_coeffs[9]=c->ugCoeff;
191
192 c->sparc_coeffs[0]=(((int16_t)c->yOffset*(int16_t)c->yCoeff >>11) & 0xffff) * 0x0001000100010001ULL;
193 c->sparc_coeffs[1]=(((int16_t)c->uOffset*(int16_t)c->ubCoeff>>11) & 0xffff) * 0x0001000100010001ULL;
194 c->sparc_coeffs[2]=(((int16_t)c->uOffset*(int16_t)c->ugCoeff>>11) & 0xffff) * 0x0001000100010001ULL;
195 c->sparc_coeffs[3]=(((int16_t)c->vOffset*(int16_t)c->vgCoeff>>11) & 0xffff) * 0x0001000100010001ULL;
196 c->sparc_coeffs[4]=(((int16_t)c->vOffset*(int16_t)c->vrCoeff>>11) & 0xffff) * 0x0001000100010001ULL;
197
198 if (c->dstFormat == PIX_FMT_RGB32 && c->srcFormat == PIX_FMT_YUV422P && (c->dstW & 7)==0) {
199 av_log(c, AV_LOG_INFO, "SPARC VIS accelerated YUV422P -> RGB32\n");
200 return vis_422P_ARGB32;
201 }
202 else if (c->dstFormat == PIX_FMT_RGB32 && c->srcFormat == PIX_FMT_YUV420P && (c->dstW & 7)==0) {
203 av_log(c, AV_LOG_INFO, "SPARC VIS accelerated YUV420P -> RGB32\n");
204 return vis_420P_ARGB32;
205 }
206 return NULL;
207}
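
The multiply by 0x0001000100010001ULL in the setup above is a lane splat: it broadcasts one 16-bit offset*coefficient product into all four 16-bit lanes of a 64-bit word, which is the form fpsub16 consumes. A standalone illustration (hypothetical splat16 helper):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t splat16(uint16_t v)
    {
        return (uint64_t)v * 0x0001000100010001ULL;  /* v | v | v | v */
    }

    int main(void)
    {
        printf("%016llx\n", (unsigned long long)splat16(0x1234));
        /* prints 1234123412341234 */
        return 0;
    }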