diff options
Diffstat (limited to 'src/plugins/ffmpeg/libswscale')
20 files changed, 0 insertions, 14687 deletions
diff --git a/src/plugins/ffmpeg/libswscale/Makefile b/src/plugins/ffmpeg/libswscale/Makefile deleted file mode 100644 index 93d27ba..0000000 --- a/src/plugins/ffmpeg/libswscale/Makefile +++ /dev/null | |||
@@ -1,25 +0,0 @@ | |||
1 | include $(SUBDIR)../config.mak | ||
2 | |||
3 | NAME = swscale | ||
4 | FFLIBS = avutil | ||
5 | |||
6 | OBJS = rgb2rgb.o swscale.o swscale_avoption.o | ||
7 | |||
8 | OBJS-$(ARCH_BFIN) += swscale_bfin.o yuv2rgb_bfin.o | ||
9 | OBJS-$(CONFIG_GPL) += yuv2rgb.o | ||
10 | OBJS-$(CONFIG_MLIB) += yuv2rgb_mlib.o | ||
11 | OBJS-$(HAVE_ALTIVEC) += yuv2rgb_altivec.o | ||
12 | OBJS-$(HAVE_VIS) += yuv2rgb_vis.o | ||
13 | |||
14 | ASM_OBJS-$(ARCH_BFIN) += internal_bfin.o | ||
15 | |||
16 | HEADERS = swscale.h rgb2rgb.h | ||
17 | |||
18 | CLEANFILES = cs_test swscale-example | ||
19 | |||
20 | include $(SUBDIR)../subdir.mak | ||
21 | |||
22 | $(SUBDIR)cs_test: $(SUBDIR)cs_test.o $(SUBDIR)$(LIBNAME) | ||
23 | |||
24 | $(SUBDIR)swscale-example: $(SUBDIR)swscale-example.o $(SUBDIR)$(LIBNAME) | ||
25 | $(SUBDIR)swscale-example: EXTRALIBS += -lm | ||
diff --git a/src/plugins/ffmpeg/libswscale/cs_test.c b/src/plugins/ffmpeg/libswscale/cs_test.c deleted file mode 100644 index d49a605..0000000 --- a/src/plugins/ffmpeg/libswscale/cs_test.c +++ /dev/null | |||
@@ -1,175 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2002 Michael Niedermayer <michaelni@gmx.at> | ||
3 | * | ||
4 | * This file is part of FFmpeg. | ||
5 | * | ||
6 | * FFmpeg is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU Lesser General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2.1 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * FFmpeg is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * Lesser General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU Lesser General Public | ||
17 | * License along with FFmpeg; if not, write to the Free Software | ||
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
19 | */ | ||
20 | |||
21 | #include <stdio.h> | ||
22 | #include <string.h> /* for memset() */ | ||
23 | #include <unistd.h> | ||
24 | #include <stdlib.h> | ||
25 | #include <inttypes.h> | ||
26 | |||
27 | #include "swscale.h" | ||
28 | #include "rgb2rgb.h" | ||
29 | |||
30 | #define SIZE 1000 | ||
31 | #define srcByte 0x55 | ||
32 | #define dstByte 0xBB | ||
33 | |||
34 | #define FUNC(s,d,n) {s,d,#n,n} | ||
35 | |||
36 | static int cpu_caps; | ||
37 | |||
38 | static char *args_parse(int argc, char *argv[]) | ||
39 | { | ||
40 | int o; | ||
41 | |||
42 | while ((o = getopt(argc, argv, "m23")) != -1) { | ||
43 | switch (o) { | ||
44 | case 'm': | ||
45 | cpu_caps |= SWS_CPU_CAPS_MMX; | ||
46 | break; | ||
47 | case '2': | ||
48 | cpu_caps |= SWS_CPU_CAPS_MMX2; | ||
49 | break; | ||
50 | case '3': | ||
51 | cpu_caps |= SWS_CPU_CAPS_3DNOW; | ||
52 | break; | ||
53 | default: | ||
54 | av_log(NULL, AV_LOG_ERROR, "Unknown option %c\n", o); | ||
55 | } | ||
56 | } | ||
57 | |||
58 | return argv[optind]; | ||
59 | } | ||
60 | |||
61 | int main(int argc, char **argv) | ||
62 | { | ||
63 | int i, funcNum; | ||
64 | uint8_t *srcBuffer= (uint8_t*)av_malloc(SIZE); | ||
65 | uint8_t *dstBuffer= (uint8_t*)av_malloc(SIZE); | ||
66 | int failedNum=0; | ||
67 | int passedNum=0; | ||
68 | |||
69 | av_log(NULL, AV_LOG_INFO, "memory corruption test ...\n"); | ||
70 | args_parse(argc, argv); | ||
71 | av_log(NULL, AV_LOG_INFO, "CPU capabilities forced to %x\n", cpu_caps); | ||
72 | sws_rgb2rgb_init(cpu_caps); | ||
73 | |||
74 | for(funcNum=0; ; funcNum++){ | ||
75 | struct func_info_s { | ||
76 | int src_bpp; | ||
77 | int dst_bpp; | ||
78 | char *name; | ||
79 | void (*func)(const uint8_t *src, uint8_t *dst, long src_size); | ||
80 | } func_info[] = { | ||
81 | FUNC(2, 2, rgb15to16), | ||
82 | FUNC(2, 3, rgb15to24), | ||
83 | FUNC(2, 4, rgb15to32), | ||
84 | FUNC(2, 3, rgb16to24), | ||
85 | FUNC(2, 4, rgb16to32), | ||
86 | FUNC(3, 2, rgb24to15), | ||
87 | FUNC(3, 2, rgb24to16), | ||
88 | FUNC(3, 4, rgb24to32), | ||
89 | FUNC(4, 2, rgb32to15), | ||
90 | FUNC(4, 2, rgb32to16), | ||
91 | FUNC(4, 3, rgb32to24), | ||
92 | FUNC(2, 2, rgb16to15), | ||
93 | FUNC(2, 2, rgb15tobgr15), | ||
94 | FUNC(2, 2, rgb15tobgr16), | ||
95 | FUNC(2, 3, rgb15tobgr24), | ||
96 | FUNC(2, 4, rgb15tobgr32), | ||
97 | FUNC(2, 2, rgb16tobgr15), | ||
98 | FUNC(2, 2, rgb16tobgr16), | ||
99 | FUNC(2, 3, rgb16tobgr24), | ||
100 | FUNC(2, 4, rgb16tobgr32), | ||
101 | FUNC(3, 2, rgb24tobgr15), | ||
102 | FUNC(3, 2, rgb24tobgr16), | ||
103 | FUNC(3, 3, rgb24tobgr24), | ||
104 | FUNC(3, 4, rgb24tobgr32), | ||
105 | FUNC(4, 2, rgb32tobgr15), | ||
106 | FUNC(4, 2, rgb32tobgr16), | ||
107 | FUNC(4, 3, rgb32tobgr24), | ||
108 | FUNC(4, 4, rgb32tobgr32), | ||
109 | FUNC(0, 0, NULL) | ||
110 | }; | ||
111 | int width; | ||
112 | int failed=0; | ||
113 | int srcBpp=0; | ||
114 | int dstBpp=0; | ||
115 | |||
116 | if (!func_info[funcNum].func) break; | ||
117 | |||
118 | av_log(NULL, AV_LOG_INFO,"."); | ||
119 | memset(srcBuffer, srcByte, SIZE); | ||
120 | |||
121 | for(width=63; width>0; width--){ | ||
122 | int dstOffset; | ||
123 | for(dstOffset=128; dstOffset<196; dstOffset+=4){ | ||
124 | int srcOffset; | ||
125 | memset(dstBuffer, dstByte, SIZE); | ||
126 | |||
127 | for(srcOffset=128; srcOffset<196; srcOffset+=4){ | ||
128 | uint8_t *src= srcBuffer+srcOffset; | ||
129 | uint8_t *dst= dstBuffer+dstOffset; | ||
130 | char *name=NULL; | ||
131 | |||
132 | if(failed) break; //don't fill the screen with shit ... | ||
133 | |||
134 | srcBpp = func_info[funcNum].src_bpp; | ||
135 | dstBpp = func_info[funcNum].dst_bpp; | ||
136 | name = func_info[funcNum].name; | ||
137 | |||
138 | func_info[funcNum].func(src, dst, width*srcBpp); | ||
139 | |||
140 | if(!srcBpp) break; | ||
141 | |||
142 | for(i=0; i<SIZE; i++){ | ||
143 | if(srcBuffer[i]!=srcByte){ | ||
144 | av_log(NULL, AV_LOG_INFO, "src damaged at %d w:%d src:%d dst:%d %s\n", | ||
145 | i, width, srcOffset, dstOffset, name); | ||
146 | failed=1; | ||
147 | break; | ||
148 | } | ||
149 | } | ||
150 | for(i=0; i<dstOffset; i++){ | ||
151 | if(dstBuffer[i]!=dstByte){ | ||
152 | av_log(NULL, AV_LOG_INFO, "dst damaged at %d w:%d src:%d dst:%d %s\n", | ||
153 | i, width, srcOffset, dstOffset, name); | ||
154 | failed=1; | ||
155 | break; | ||
156 | } | ||
157 | } | ||
158 | for(i=dstOffset + width*dstBpp; i<SIZE; i++){ | ||
159 | if(dstBuffer[i]!=dstByte){ | ||
160 | av_log(NULL, AV_LOG_INFO, "dst damaged at %d w:%d src:%d dst:%d %s\n", | ||
161 | i, width, srcOffset, dstOffset, name); | ||
162 | failed=1; | ||
163 | break; | ||
164 | } | ||
165 | } | ||
166 | } | ||
167 | } | ||
168 | } | ||
169 | if(failed) failedNum++; | ||
170 | else if(srcBpp) passedNum++; | ||
171 | } | ||
172 | |||
173 | av_log(NULL, AV_LOG_INFO, "\n%d converters passed, %d converters randomly overwrote memory\n", passedNum, failedNum); | ||
174 | return failedNum; | ||
175 | } | ||
diff --git a/src/plugins/ffmpeg/libswscale/internal_bfin.S b/src/plugins/ffmpeg/libswscale/internal_bfin.S deleted file mode 100644 index fb7bda7..0000000 --- a/src/plugins/ffmpeg/libswscale/internal_bfin.S +++ /dev/null | |||
@@ -1,606 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com> | ||
3 | * April 20, 2007 | ||
4 | * | ||
5 | * Blackfin video color space converter operations | ||
6 | * convert I420 YV12 to RGB in various formats | ||
7 | * | ||
8 | * This file is part of FFmpeg. | ||
9 | * | ||
10 | * FFmpeg is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU Lesser General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2.1 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * FFmpeg is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * Lesser General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU Lesser General Public | ||
21 | * License along with FFmpeg; if not, write to the Free Software | ||
22 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
23 | */ | ||
24 | |||
25 | |||
26 | /* | ||
27 | YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock | ||
28 | and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts. | ||
29 | |||
30 | |||
31 | The following calculation is used for the conversion: | ||
32 | |||
33 | r = clipz((y-oy)*cy + crv*(v-128)) | ||
34 | g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128)) | ||
35 | b = clipz((y-oy)*cy + cbu*(u-128)) | ||
36 | |||
37 | y,u,v are prescaled by a factor of 4 i.e. left-shifted to gain precision. | ||
38 | |||
39 | |||
40 | New factorization to eliminate the truncation error which was | ||
41 | occurring due to the byteop3p. | ||
42 | |||
43 | |||
44 | 1) Use the bytop16m to subtract quad bytes we use this in U8 this | ||
45 | then so the offsets need to be renormalized to 8bits. | ||
46 | |||
47 | 2) Scale operands up by a factor of 4 not 8 because Blackfin | ||
48 | multiplies include a shift. | ||
49 | |||
50 | 3) Compute into the accumulators cy*yx0, cy*yx1. | ||
51 | |||
52 | 4) Compute each of the linear equations: | ||
53 | r = clipz((y - oy) * cy + crv * (v - 128)) | ||
54 | |||
55 | g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128)) | ||
56 | |||
57 | b = clipz((y - oy) * cy + cbu * (u - 128)) | ||
58 | |||
59 | Reuse of the accumulators requires that we actually multiply | ||
60 | twice once with addition and the second time with a subtraction. | ||
61 | |||
62 | Because of this we need to compute the equations in the order R B | ||
63 | then G saving the writes for B in the case of 24/32 bit color | ||
64 | formats. | ||
65 | |||
66 | API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, | ||
67 | int dW, uint32_t *coeffs); | ||
68 | |||
69 | A B | ||
70 | --- --- | ||
71 | i2 = cb i3 = cr | ||
72 | i1 = coeff i0 = y | ||
73 | |||
74 | Where coeffs have the following layout in memory. | ||
75 | |||
76 | uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv; | ||
77 | |||
78 | coeffs is a pointer to oy. | ||
79 | |||
80 | The {rgb} masks are only utilized by the 565 packing algorithm. Note the data | ||
81 | replication is used to simplify the internal algorithms for the dual Mac | ||
82 | architecture of BlackFin. | ||
83 | |||
84 | All routines are exported with _ff_bfin_ as a symbol prefix. | ||
85 | |||
86 | Rough performance gain compared against -O3: | ||
87 | |||
88 | 2779809/1484290 187.28% | ||
89 | |||
90 | which translates to ~33c/pel to ~57c/pel for the reference vs 17.5 | ||
91 | c/pel for the optimized implementations. Not sure why there is such a | ||
92 | huge variation on the reference codes on Blackfin I guess it must have | ||
93 | to do with the memory system. | ||
94 | */ | ||
95 | |||
96 | #define mL3 .text | ||
97 | #ifdef __FDPIC__ | ||
98 | #define mL1 .l1.text | ||
99 | #else | ||
100 | #define mL1 mL3 | ||
101 | #endif | ||
102 | #define MEM mL1 | ||
103 | |||
104 | #define DEFUN(fname,where,interface) \ | ||
105 | .section where; \ | ||
106 | .global _ff_bfin_ ## fname; \ | ||
107 | .type _ff_bfin_ ## fname, STT_FUNC; \ | ||
108 | .align 8; \ | ||
109 | _ff_bfin_ ## fname | ||
110 | |||
111 | #define DEFUN_END(fname) \ | ||
112 | .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname | ||
113 | |||
114 | |||
115 | .text | ||
116 | |||
117 | #define COEFF_LEN 11*4 | ||
118 | #define COEFF_REL_CY_OFF 4*4 | ||
119 | |||
120 | #define ARG_OUT 20 | ||
121 | #define ARG_W 24 | ||
122 | #define ARG_COEFF 28 | ||
123 | |||
124 | DEFUN(yuv2rgb565_line,MEM, | ||
125 | (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)): | ||
126 | link 0; | ||
127 | [--sp] = (r7:4); | ||
128 | p1 = [fp+ARG_OUT]; | ||
129 | r3 = [fp+ARG_W]; | ||
130 | |||
131 | i0 = r0; | ||
132 | i2 = r1; | ||
133 | i3 = r2; | ||
134 | |||
135 | r0 = [fp+ARG_COEFF]; | ||
136 | i1 = r0; | ||
137 | b1 = i1; | ||
138 | l1 = COEFF_LEN; | ||
139 | m0 = COEFF_REL_CY_OFF; | ||
140 | p0 = r3; | ||
141 | |||
142 | r0 = [i0++]; // 2Y | ||
143 | r1.l = w[i2++]; // 2u | ||
144 | r1.h = w[i3++]; // 2v | ||
145 | p0 = p0>>2; | ||
146 | |||
147 | lsetup (.L0565, .L1565) lc0 = p0; | ||
148 | |||
149 | /* | ||
150 | uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv | ||
151 | r0 -- used to load 4ys | ||
152 | r1 -- used to load 2us,2vs | ||
153 | r4 -- y3,y2 | ||
154 | r5 -- y1,y0 | ||
155 | r6 -- u1,u0 | ||
156 | r7 -- v1,v0 | ||
157 | */ | ||
158 | r2=[i1++]; // oy | ||
159 | .L0565: | ||
160 | /* | ||
161 | rrrrrrrr gggggggg bbbbbbbb | ||
162 | 5432109876543210 | ||
163 | bbbbb >>3 | ||
164 | gggggggg <<3 | ||
165 | rrrrrrrr <<8 | ||
166 | rrrrrggggggbbbbb | ||
167 | */ | ||
168 | (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc | ||
169 | (r7,r6) = byteop16m (r1:0, r3:2) (r); | ||
170 | r5 = r5 << 2 (v); // y1,y0 | ||
171 | r4 = r4 << 2 (v); // y3,y2 | ||
172 | r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero | ||
173 | r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy | ||
174 | /* Y' = y*cy */ | ||
175 | a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv | ||
176 | |||
177 | /* R = Y+ crv*(Cr-128) */ | ||
178 | r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); | ||
179 | a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask | ||
180 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu | ||
181 | r2 = r2 >> 3 (v); | ||
182 | r3 = r2 & r5; | ||
183 | |||
184 | /* B = Y+ cbu*(Cb-128) */ | ||
185 | r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l); | ||
186 | a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask | ||
187 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu | ||
188 | r2 = r2 << 8 (v); | ||
189 | r2 = r2 & r5; | ||
190 | r3 = r3 | r2; | ||
191 | |||
192 | /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ | ||
193 | a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv | ||
194 | r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); | ||
195 | r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask | ||
196 | r2 = r2 << 3 (v); | ||
197 | r2 = r2 & r5; | ||
198 | r3 = r3 | r2; | ||
199 | [p1++]=r3 || r1=[i1++]; // cy | ||
200 | |||
201 | /* Y' = y*cy */ | ||
202 | |||
203 | a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv | ||
204 | |||
205 | /* R = Y+ crv*(Cr-128) */ | ||
206 | r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); | ||
207 | a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask | ||
208 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu | ||
209 | r2 = r2 >> 3 (v); | ||
210 | r3 = r2 & r5; | ||
211 | |||
212 | /* B = Y+ cbu*(Cb-128) */ | ||
213 | r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h); | ||
214 | a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask | ||
215 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu | ||
216 | r2 = r2 << 8 (v); | ||
217 | r2 = r2 & r5; | ||
218 | r3 = r3 | r2; | ||
219 | |||
220 | /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ | ||
221 | a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv | ||
222 | r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask | ||
223 | r2 = byteop3p(r3:2, r1:0)(LO) || r0 = [i0++]; // 2Y | ||
224 | r2 = r2 << 3 (v) || r1.l = w[i2++]; // 2u | ||
225 | r2 = r2 & r5; | ||
226 | r3 = r3 | r2; | ||
227 | [p1++]=r3 || r1.h = w[i3++]; // 2v | ||
228 | .L1565: r2=[i1++]; // oy | ||
229 | |||
230 | l1 = 0; | ||
231 | |||
232 | (r7:4) = [sp++]; | ||
233 | unlink; | ||
234 | rts; | ||
235 | DEFUN_END(yuv2rgb565_line) | ||
236 | |||
237 | DEFUN(yuv2rgb555_line,MEM, | ||
238 | (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)): | ||
239 | link 0; | ||
240 | [--sp] = (r7:4); | ||
241 | p1 = [fp+ARG_OUT]; | ||
242 | r3 = [fp+ARG_W]; | ||
243 | |||
244 | i0 = r0; | ||
245 | i2 = r1; | ||
246 | i3 = r2; | ||
247 | |||
248 | r0 = [fp+ARG_COEFF]; | ||
249 | i1 = r0; | ||
250 | b1 = i1; | ||
251 | l1 = COEFF_LEN; | ||
252 | m0 = COEFF_REL_CY_OFF; | ||
253 | p0 = r3; | ||
254 | |||
255 | r0 = [i0++]; // 2Y | ||
256 | r1.l = w[i2++]; // 2u | ||
257 | r1.h = w[i3++]; // 2v | ||
258 | p0 = p0>>2; | ||
259 | |||
260 | lsetup (.L0555, .L1555) lc0 = p0; | ||
261 | |||
262 | /* | ||
263 | uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv | ||
264 | r0 -- used to load 4ys | ||
265 | r1 -- used to load 2us,2vs | ||
266 | r4 -- y3,y2 | ||
267 | r5 -- y1,y0 | ||
268 | r6 -- u1,u0 | ||
269 | r7 -- v1,v0 | ||
270 | */ | ||
271 | r2=[i1++]; // oy | ||
272 | .L0555: | ||
273 | /* | ||
274 | rrrrrrrr gggggggg bbbbbbbb | ||
275 | 5432109876543210 | ||
276 | bbbbb >>3 | ||
277 | gggggggg <<2 | ||
278 | rrrrrrrr <<7 | ||
279 | xrrrrrgggggbbbbb | ||
280 | */ | ||
281 | |||
282 | (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc | ||
283 | (r7,r6) = byteop16m (r1:0, r3:2) (r); | ||
284 | r5 = r5 << 2 (v); // y1,y0 | ||
285 | r4 = r4 << 2 (v); // y3,y2 | ||
286 | r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero | ||
287 | r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy | ||
288 | /* Y' = y*cy */ | ||
289 | a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv | ||
290 | |||
291 | /* R = Y+ crv*(Cr-128) */ | ||
292 | r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); | ||
293 | a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask | ||
294 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu | ||
295 | r2 = r2 >> 3 (v); | ||
296 | r3 = r2 & r5; | ||
297 | |||
298 | /* B = Y+ cbu*(Cb-128) */ | ||
299 | r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l); | ||
300 | a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask | ||
301 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu | ||
302 | r2 = r2 << 7 (v); | ||
303 | r2 = r2 & r5; | ||
304 | r3 = r3 | r2; | ||
305 | |||
306 | /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ | ||
307 | a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv | ||
308 | r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); | ||
309 | r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask | ||
310 | r2 = r2 << 2 (v); | ||
311 | r2 = r2 & r5; | ||
312 | r3 = r3 | r2; | ||
313 | [p1++]=r3 || r1=[i1++]; // cy | ||
314 | |||
315 | /* Y' = y*cy */ | ||
316 | |||
317 | a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv | ||
318 | |||
319 | /* R = Y+ crv*(Cr-128) */ | ||
320 | r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); | ||
321 | a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask | ||
322 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu | ||
323 | r2 = r2 >> 3 (v); | ||
324 | r3 = r2 & r5; | ||
325 | |||
326 | /* B = Y+ cbu*(Cb-128) */ | ||
327 | r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h); | ||
328 | a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask | ||
329 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu | ||
330 | r2 = r2 << 7 (v); | ||
331 | r2 = r2 & r5; | ||
332 | r3 = r3 | r2; | ||
333 | |||
334 | /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ | ||
335 | a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv | ||
336 | r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask | ||
337 | r2 = byteop3p(r3:2, r1:0)(LO) || r0=[i0++]; // 4Y | ||
338 | r2 = r2 << 2 (v) || r1.l=w[i2++]; // 2u | ||
339 | r2 = r2 & r5; | ||
340 | r3 = r3 | r2; | ||
341 | [p1++]=r3 || r1.h=w[i3++]; // 2v | ||
342 | |||
343 | .L1555: r2=[i1++]; // oy | ||
344 | |||
345 | l1 = 0; | ||
346 | |||
347 | (r7:4) = [sp++]; | ||
348 | unlink; | ||
349 | rts; | ||
350 | DEFUN_END(yuv2rgb555_line) | ||
351 | |||
352 | DEFUN(yuv2rgb24_line,MEM, | ||
353 | (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)): | ||
354 | link 0; | ||
355 | [--sp] = (r7:4); | ||
356 | p1 = [fp+ARG_OUT]; | ||
357 | r3 = [fp+ARG_W]; | ||
358 | p2 = p1; | ||
359 | p2 += 3; | ||
360 | |||
361 | i0 = r0; | ||
362 | i2 = r1; | ||
363 | i3 = r2; | ||
364 | |||
365 | r0 = [fp+ARG_COEFF]; // coeff buffer | ||
366 | i1 = r0; | ||
367 | b1 = i1; | ||
368 | l1 = COEFF_LEN; | ||
369 | m0 = COEFF_REL_CY_OFF; | ||
370 | p0 = r3; | ||
371 | |||
372 | r0 = [i0++]; // 2Y | ||
373 | r1.l = w[i2++]; // 2u | ||
374 | r1.h = w[i3++]; // 2v | ||
375 | p0 = p0>>2; | ||
376 | |||
377 | lsetup (.L0888, .L1888) lc0 = p0; | ||
378 | |||
379 | /* | ||
380 | uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv | ||
381 | r0 -- used to load 4ys | ||
382 | r1 -- used to load 2us,2vs | ||
383 | r4 -- y3,y2 | ||
384 | r5 -- y1,y0 | ||
385 | r6 -- u1,u0 | ||
386 | r7 -- v1,v0 | ||
387 | */ | ||
388 | r2=[i1++]; // oy | ||
389 | .L0888: | ||
390 | (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc | ||
391 | (r7,r6) = byteop16m (r1:0, r3:2) (r); | ||
392 | r5 = r5 << 2 (v); // y1,y0 | ||
393 | r4 = r4 << 2 (v); // y3,y2 | ||
394 | r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero | ||
395 | r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy | ||
396 | |||
397 | /* Y' = y*cy */ | ||
398 | a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv | ||
399 | |||
400 | /* R = Y+ crv*(Cr-128) */ | ||
401 | r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); | ||
402 | a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask | ||
403 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu | ||
404 | r2=r2>>16 || B[p1++]=r2; | ||
405 | B[p2++]=r2; | ||
406 | |||
407 | /* B = Y+ cbu*(Cb-128) */ | ||
408 | r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l); | ||
409 | a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask | ||
410 | r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu | ||
411 | |||
412 | /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ | ||
413 | a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv | ||
414 | r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l); | ||
415 | r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask, oy,cy,zero | ||
416 | |||
417 | r2=r2>>16 || B[p1++]=r2; | ||
418 | B[p2++]=r2; | ||
419 | |||
420 | r3=r3>>16 || B[p1++]=r3; | ||
421 | B[p2++]=r3 || r1=[i1++]; // cy | ||
422 | |||
423 | p1+=3; | ||
424 | p2+=3; | ||
425 | /* Y' = y*cy */ | ||
426 | a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv | ||
427 | |||
428 | /* R = Y+ crv*(Cr-128) */ | ||
429 | r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); | ||
430 | a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask | ||
431 | r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu | ||
432 | r2=r2>>16 || B[p1++]=r2; | ||
433 | B[p2++]=r2; | ||
434 | |||
435 | /* B = Y+ cbu*(Cb-128) */ | ||
436 | r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h); | ||
437 | a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask | ||
438 | r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu | ||
439 | |||
440 | /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */ | ||
441 | a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv | ||
442 | r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h); | ||
443 | r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++]; // gmask | ||
444 | r2=r2>>16 || B[p1++]=r2 || r0 = [i0++]; // 4y | ||
445 | B[p2++]=r2 || r1.l = w[i2++]; // 2u | ||
446 | r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v | ||
447 | B[p2++]=r3 || r2=[i1++]; // oy | ||
448 | |||
449 | p1+=3; | ||
450 | .L1888: p2+=3; | ||
451 | |||
452 | l1 = 0; | ||
453 | |||
454 | (r7:4) = [sp++]; | ||
455 | unlink; | ||
456 | rts; | ||
457 | DEFUN_END(yuv2rgb24_line) | ||
458 | |||
459 | |||
460 | |||
461 | #define ARG_vdst 20 | ||
462 | #define ARG_width 24 | ||
463 | #define ARG_height 28 | ||
464 | #define ARG_lumStride 32 | ||
465 | #define ARG_chromStride 36 | ||
466 | #define ARG_srcStride 40 | ||
467 | |||
468 | DEFUN(uyvytoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | ||
469 | long width, long height, | ||
470 | long lumStride, long chromStride, long srcStride)): | ||
471 | link 0; | ||
472 | [--sp] = (r7:4,p5:4); | ||
473 | |||
474 | p0 = r1; // Y top even | ||
475 | |||
476 | i2 = r2; // *u | ||
477 | r2 = [fp + ARG_vdst]; | ||
478 | i3 = r2; // *v | ||
479 | |||
480 | r1 = [fp + ARG_srcStride]; | ||
481 | r2 = r0 + r1; | ||
482 | r1 += -8; // i0,i1 is pre read need to correct | ||
483 | m0 = r1; | ||
484 | |||
485 | i0 = r0; // uyvy_T even | ||
486 | i1 = r2; // uyvy_B odd | ||
487 | |||
488 | p2 = [fp + ARG_lumStride]; | ||
489 | p1 = p0 + p2; // Y bot odd | ||
490 | |||
491 | p5 = [fp + ARG_width]; | ||
492 | p4 = [fp + ARG_height]; | ||
493 | r0 = p5; | ||
494 | p4 = p4 >> 1; | ||
495 | p5 = p5 >> 2; | ||
496 | |||
497 | r2 = [fp + ARG_chromStride]; | ||
498 | r0 = r0 >> 1; | ||
499 | r2 = r2 - r0; | ||
500 | m1 = r2; | ||
501 | |||
502 | /* I0,I1 - src input line pointers | ||
503 | * p0,p1 - luma output line pointers | ||
504 | * I2 - dstU | ||
505 | * I3 - dstV | ||
506 | */ | ||
507 | |||
508 | lsetup (0f, 1f) lc1 = p4; // H/2 | ||
509 | 0: r0 = [i0++] || r2 = [i1++]; | ||
510 | r1 = [i0++] || r3 = [i1++]; | ||
511 | r4 = byteop1p(r1:0, r3:2); | ||
512 | r5 = byteop1p(r1:0, r3:2) (r); | ||
513 | lsetup (2f, 3f) lc0 = p5; // W/4 | ||
514 | 2: r0 = r0 >> 8(v); | ||
515 | r1 = r1 >> 8(v); | ||
516 | r2 = r2 >> 8(v); | ||
517 | r3 = r3 >> 8(v); | ||
518 | r0 = bytepack(r0, r1); | ||
519 | r2 = bytepack(r2, r3) || [p0++] = r0; // yyyy | ||
520 | r6 = pack(r5.l, r4.l) || [p1++] = r2; // yyyy | ||
521 | r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++]; | ||
522 | r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++]; | ||
523 | r4 = byteop1p(r1:0, r3:2) || w[i2++] = r6.l; // uu | ||
524 | 3: r5 = byteop1p(r1:0, r3:2) (r) || w[i3++] = r6.h; // vv | ||
525 | |||
526 | i0 += m0; | ||
527 | i1 += m0; | ||
528 | i2 += m1; | ||
529 | i3 += m1; | ||
530 | p0 = p0 + p2; | ||
531 | 1: p1 = p1 + p2; | ||
532 | |||
533 | (r7:4,p5:4) = [sp++]; | ||
534 | unlink; | ||
535 | rts; | ||
536 | DEFUN_END(uyvytoyv12) | ||
537 | |||
538 | DEFUN(yuyvtoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | ||
539 | long width, long height, | ||
540 | long lumStride, long chromStride, long srcStride)): | ||
541 | link 0; | ||
542 | [--sp] = (r7:4,p5:4); | ||
543 | |||
544 | p0 = r1; // Y top even | ||
545 | |||
546 | i2 = r2; // *u | ||
547 | r2 = [fp + ARG_vdst]; | ||
548 | i3 = r2; // *v | ||
549 | |||
550 | r1 = [fp + ARG_srcStride]; | ||
551 | r2 = r0 + r1; | ||
552 | r1 += -8; // i0,i1 is pre read need to correct | ||
553 | m0 = r1; | ||
554 | |||
555 | i0 = r0; // uyvy_T even | ||
556 | i1 = r2; // uyvy_B odd | ||
557 | |||
558 | p2 = [fp + ARG_lumStride]; | ||
559 | p1 = p0 + p2; // Y bot odd | ||
560 | |||
561 | p5 = [fp + ARG_width]; | ||
562 | p4 = [fp + ARG_height]; | ||
563 | r0 = p5; | ||
564 | p4 = p4 >> 1; | ||
565 | p5 = p5 >> 2; | ||
566 | |||
567 | r2 = [fp + ARG_chromStride]; | ||
568 | r0 = r0 >> 1; | ||
569 | r2 = r2 - r0; | ||
570 | m1 = r2; | ||
571 | |||
572 | /* I0,I1 - src input line pointers | ||
573 | * p0,p1 - luma output line pointers | ||
574 | * I2 - dstU | ||
575 | * I3 - dstV | ||
576 | */ | ||
577 | |||
578 | lsetup (0f, 1f) lc1 = p4; // H/2 | ||
579 | 0: r0 = [i0++] || r2 = [i1++]; | ||
580 | r1 = [i0++] || r3 = [i1++]; | ||
581 | r4 = bytepack(r0, r1); | ||
582 | r5 = bytepack(r2, r3); | ||
583 | lsetup (2f, 3f) lc0 = p5; // W/4 | ||
584 | 2: r0 = r0 >> 8(v) || [p0++] = r4; // yyyy-even | ||
585 | r1 = r1 >> 8(v) || [p1++] = r5; // yyyy-odd | ||
586 | r2 = r2 >> 8(v); | ||
587 | r3 = r3 >> 8(v); | ||
588 | r4 = byteop1p(r1:0, r3:2); | ||
589 | r5 = byteop1p(r1:0, r3:2) (r); | ||
590 | r6 = pack(r5.l, r4.l); | ||
591 | r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++]; | ||
592 | r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++]; | ||
593 | r4 = bytepack(r0, r1) || w[i2++] = r6.l; // uu | ||
594 | 3: r5 = bytepack(r2, r3) || w[i3++] = r6.h; // vv | ||
595 | |||
596 | i0 += m0; | ||
597 | i1 += m0; | ||
598 | i2 += m1; | ||
599 | i3 += m1; | ||
600 | p0 = p0 + p2; | ||
601 | 1: p1 = p1 + p2; | ||
602 | |||
603 | (r7:4,p5:4) = [sp++]; | ||
604 | unlink; | ||
605 | rts; | ||
606 | DEFUN_END(yuyvtoyv12) | ||
diff --git a/src/plugins/ffmpeg/libswscale/rgb2rgb.c b/src/plugins/ffmpeg/libswscale/rgb2rgb.c deleted file mode 100644 index 14c4070..0000000 --- a/src/plugins/ffmpeg/libswscale/rgb2rgb.c +++ /dev/null | |||
@@ -1,534 +0,0 @@ | |||
1 | /* | ||
2 | * software RGB to RGB converter | ||
3 | * pluralize by software PAL8 to RGB converter | ||
4 | * software YUV to YUV converter | ||
5 | * software YUV to RGB converter | ||
6 | * Written by Nick Kurshev. | ||
7 | * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) | ||
8 | * | ||
9 | * This file is part of FFmpeg. | ||
10 | * | ||
11 | * FFmpeg is free software; you can redistribute it and/or modify | ||
12 | * it under the terms of the GNU General Public License as published by | ||
13 | * the Free Software Foundation; either version 2 of the License, or | ||
14 | * (at your option) any later version. | ||
15 | * | ||
16 | * FFmpeg is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | * GNU General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public License | ||
22 | * along with FFmpeg; if not, write to the Free Software | ||
23 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
24 | * | ||
25 | * The C code (not assembly, MMX, ...) of this file can be used | ||
26 | * under the LGPL license. | ||
27 | */ | ||
28 | #include <inttypes.h> | ||
29 | #include "config.h" | ||
30 | #include "libavutil/x86_cpu.h" | ||
31 | #include "libavutil/bswap.h" | ||
32 | #include "rgb2rgb.h" | ||
33 | #include "swscale.h" | ||
34 | #include "swscale_internal.h" | ||
35 | |||
36 | #define FAST_BGR2YV12 // use 7-bit instead of 15-bit coefficients | ||
37 | |||
38 | void (*rgb24to32)(const uint8_t *src, uint8_t *dst, long src_size); | ||
39 | void (*rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size); | ||
40 | void (*rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size); | ||
41 | void (*rgb32to24)(const uint8_t *src, uint8_t *dst, long src_size); | ||
42 | void (*rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size); | ||
43 | void (*rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size); | ||
44 | void (*rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size); | ||
45 | void (*rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size); | ||
46 | void (*rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size); | ||
47 | void (*rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size); | ||
48 | void (*rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size); | ||
49 | void (*rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size); | ||
50 | //void (*rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size); | ||
51 | void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size); | ||
52 | void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size); | ||
53 | void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size); | ||
54 | void (*rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size); | ||
55 | //void (*rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size); | ||
56 | void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size); | ||
57 | void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size); | ||
58 | |||
59 | void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | ||
60 | long width, long height, | ||
61 | long lumStride, long chromStride, long dstStride); | ||
62 | void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | ||
63 | long width, long height, | ||
64 | long lumStride, long chromStride, long dstStride); | ||
65 | void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | ||
66 | long width, long height, | ||
67 | long lumStride, long chromStride, long dstStride); | ||
68 | void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | ||
69 | long width, long height, | ||
70 | long lumStride, long chromStride, long srcStride); | ||
71 | void (*rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | ||
72 | long width, long height, | ||
73 | long lumStride, long chromStride, long srcStride); | ||
74 | void (*planar2x)(const uint8_t *src, uint8_t *dst, long width, long height, | ||
75 | long srcStride, long dstStride); | ||
76 | void (*interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dst, | ||
77 | long width, long height, long src1Stride, | ||
78 | long src2Stride, long dstStride); | ||
79 | void (*vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2, | ||
80 | uint8_t *dst1, uint8_t *dst2, | ||
81 | long width, long height, | ||
82 | long srcStride1, long srcStride2, | ||
83 | long dstStride1, long dstStride2); | ||
84 | void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, | ||
85 | uint8_t *dst, | ||
86 | long width, long height, | ||
87 | long srcStride1, long srcStride2, | ||
88 | long srcStride3, long dstStride); | ||
89 | |||
90 | #if defined(ARCH_X86) && defined(CONFIG_GPL) | ||
91 | DECLARE_ASM_CONST(8, uint64_t, mmx_null) = 0x0000000000000000ULL; | ||
92 | DECLARE_ASM_CONST(8, uint64_t, mmx_one) = 0xFFFFFFFFFFFFFFFFULL; | ||
93 | DECLARE_ASM_CONST(8, uint64_t, mask32b) = 0x000000FF000000FFULL; | ||
94 | DECLARE_ASM_CONST(8, uint64_t, mask32g) = 0x0000FF000000FF00ULL; | ||
95 | DECLARE_ASM_CONST(8, uint64_t, mask32r) = 0x00FF000000FF0000ULL; | ||
96 | DECLARE_ASM_CONST(8, uint64_t, mask32) = 0x00FFFFFF00FFFFFFULL; | ||
97 | DECLARE_ASM_CONST(8, uint64_t, mask3216br) = 0x00F800F800F800F8ULL; | ||
98 | DECLARE_ASM_CONST(8, uint64_t, mask3216g) = 0x0000FC000000FC00ULL; | ||
99 | DECLARE_ASM_CONST(8, uint64_t, mask3215g) = 0x0000F8000000F800ULL; | ||
100 | DECLARE_ASM_CONST(8, uint64_t, mul3216) = 0x2000000420000004ULL; | ||
101 | DECLARE_ASM_CONST(8, uint64_t, mul3215) = 0x2000000820000008ULL; | ||
102 | DECLARE_ASM_CONST(8, uint64_t, mask24b) = 0x00FF0000FF0000FFULL; | ||
103 | DECLARE_ASM_CONST(8, uint64_t, mask24g) = 0xFF0000FF0000FF00ULL; | ||
104 | DECLARE_ASM_CONST(8, uint64_t, mask24r) = 0x0000FF0000FF0000ULL; | ||
105 | DECLARE_ASM_CONST(8, uint64_t, mask24l) = 0x0000000000FFFFFFULL; | ||
106 | DECLARE_ASM_CONST(8, uint64_t, mask24h) = 0x0000FFFFFF000000ULL; | ||
107 | DECLARE_ASM_CONST(8, uint64_t, mask24hh) = 0xffff000000000000ULL; | ||
108 | DECLARE_ASM_CONST(8, uint64_t, mask24hhh) = 0xffffffff00000000ULL; | ||
109 | DECLARE_ASM_CONST(8, uint64_t, mask24hhhh) = 0xffffffffffff0000ULL; | ||
110 | DECLARE_ASM_CONST(8, uint64_t, mask15b) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */ | ||
111 | DECLARE_ASM_CONST(8, uint64_t, mask15rg) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */ | ||
112 | DECLARE_ASM_CONST(8, uint64_t, mask15s) = 0xFFE0FFE0FFE0FFE0ULL; | ||
113 | DECLARE_ASM_CONST(8, uint64_t, mask15g) = 0x03E003E003E003E0ULL; | ||
114 | DECLARE_ASM_CONST(8, uint64_t, mask15r) = 0x7C007C007C007C00ULL; | ||
115 | #define mask16b mask15b | ||
116 | DECLARE_ASM_CONST(8, uint64_t, mask16g) = 0x07E007E007E007E0ULL; | ||
117 | DECLARE_ASM_CONST(8, uint64_t, mask16r) = 0xF800F800F800F800ULL; | ||
118 | DECLARE_ASM_CONST(8, uint64_t, red_16mask) = 0x0000f8000000f800ULL; | ||
119 | DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL; | ||
120 | DECLARE_ASM_CONST(8, uint64_t, blue_16mask) = 0x0000001f0000001fULL; | ||
121 | DECLARE_ASM_CONST(8, uint64_t, red_15mask) = 0x00007c0000007c00ULL; | ||
122 | DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL; | ||
123 | DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL; | ||
124 | |||
125 | #if 0 | ||
126 | static volatile uint64_t __attribute__((aligned(8))) b5Dither; | ||
127 | static volatile uint64_t __attribute__((aligned(8))) g5Dither; | ||
128 | static volatile uint64_t __attribute__((aligned(8))) g6Dither; | ||
129 | static volatile uint64_t __attribute__((aligned(8))) r5Dither; | ||
130 | |||
131 | static uint64_t __attribute__((aligned(8))) dither4[2]={ | ||
132 | 0x0103010301030103LL, | ||
133 | 0x0200020002000200LL,}; | ||
134 | |||
135 | static uint64_t __attribute__((aligned(8))) dither8[2]={ | ||
136 | 0x0602060206020602LL, | ||
137 | 0x0004000400040004LL,}; | ||
138 | #endif | ||
139 | #endif /* defined(ARCH_X86) */ | ||
140 | |||
141 | #define RGB2YUV_SHIFT 8 | ||
142 | #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5)) | ||
143 | #define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5)) | ||
144 | #define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5)) | ||
145 | #define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5)) | ||
146 | #define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5)) | ||
147 | #define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5)) | ||
148 | #define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5)) | ||
149 | #define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5)) | ||
150 | #define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5)) | ||
151 | |||
152 | //Note: We have C, MMX, MMX2, 3DNOW versions, there is no 3DNOW + MMX2 one. | ||
153 | //plain C versions | ||
154 | #undef HAVE_MMX | ||
155 | #undef HAVE_MMX2 | ||
156 | #undef HAVE_3DNOW | ||
157 | #undef HAVE_SSE2 | ||
158 | #define RENAME(a) a ## _C | ||
159 | #include "rgb2rgb_template.c" | ||
160 | |||
161 | #if defined(ARCH_X86) && defined(CONFIG_GPL) | ||
162 | |||
163 | //MMX versions | ||
164 | #undef RENAME | ||
165 | #define HAVE_MMX | ||
166 | #undef HAVE_MMX2 | ||
167 | #undef HAVE_3DNOW | ||
168 | #undef HAVE_SSE2 | ||
169 | #define RENAME(a) a ## _MMX | ||
170 | #include "rgb2rgb_template.c" | ||
171 | |||
172 | //MMX2 versions | ||
173 | #undef RENAME | ||
174 | #define HAVE_MMX | ||
175 | #define HAVE_MMX2 | ||
176 | #undef HAVE_3DNOW | ||
177 | #undef HAVE_SSE2 | ||
178 | #define RENAME(a) a ## _MMX2 | ||
179 | #include "rgb2rgb_template.c" | ||
180 | |||
181 | //3DNOW versions | ||
182 | #undef RENAME | ||
183 | #define HAVE_MMX | ||
184 | #undef HAVE_MMX2 | ||
185 | #define HAVE_3DNOW | ||
186 | #undef HAVE_SSE2 | ||
187 | #define RENAME(a) a ## _3DNOW | ||
188 | #include "rgb2rgb_template.c" | ||
189 | |||
190 | #endif //ARCH_X86 || ARCH_X86_64 | ||
191 | |||
192 | /* | ||
193 | RGB15->RGB16 original by Strepto/Astral | ||
194 | ported to gcc & bugfixed : A'rpi | ||
195 | MMX2, 3DNOW optimization by Nick Kurshev | ||
196 | 32-bit C version, and and&add trick by Michael Niedermayer | ||
197 | */ | ||
198 | |||
199 | void sws_rgb2rgb_init(int flags){ | ||
200 | #if (defined(HAVE_MMX2) || defined(HAVE_3DNOW) || defined(HAVE_MMX)) && defined(CONFIG_GPL) | ||
201 | if (flags & SWS_CPU_CAPS_MMX2) | ||
202 | rgb2rgb_init_MMX2(); | ||
203 | else if (flags & SWS_CPU_CAPS_3DNOW) | ||
204 | rgb2rgb_init_3DNOW(); | ||
205 | else if (flags & SWS_CPU_CAPS_MMX) | ||
206 | rgb2rgb_init_MMX(); | ||
207 | else | ||
208 | #endif /* defined(HAVE_MMX2) || defined(HAVE_3DNOW) || defined(HAVE_MMX) */ | ||
209 | rgb2rgb_init_C(); | ||
210 | } | ||
211 | |||
212 | /** | ||
213 | * Palette is assumed to contain BGR32. | ||
214 | */ | ||
215 | void palette8torgb32(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette) | ||
216 | { | ||
217 | long i; | ||
218 | |||
219 | /* | ||
220 | for (i=0; i<num_pixels; i++) | ||
221 | ((unsigned *)dst)[i] = ((unsigned *)palette)[src[i]]; | ||
222 | */ | ||
223 | |||
224 | for (i=0; i<num_pixels; i++) | ||
225 | { | ||
226 | #ifdef WORDS_BIGENDIAN | ||
227 | dst[3]= palette[src[i]*4+2]; | ||
228 | dst[2]= palette[src[i]*4+1]; | ||
229 | dst[1]= palette[src[i]*4+0]; | ||
230 | #else | ||
231 | //FIXME slow? | ||
232 | dst[0]= palette[src[i]*4+2]; | ||
233 | dst[1]= palette[src[i]*4+1]; | ||
234 | dst[2]= palette[src[i]*4+0]; | ||
235 | //dst[3]= 0; /* do we need this cleansing? */ | ||
236 | #endif | ||
237 | dst+= 4; | ||
238 | } | ||
239 | } | ||
240 | |||
241 | void palette8tobgr32(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette) | ||
242 | { | ||
243 | long i; | ||
244 | for (i=0; i<num_pixels; i++) | ||
245 | { | ||
246 | #ifdef WORDS_BIGENDIAN | ||
247 | dst[3]= palette[src[i]*4+0]; | ||
248 | dst[2]= palette[src[i]*4+1]; | ||
249 | dst[1]= palette[src[i]*4+2]; | ||
250 | #else | ||
251 | //FIXME slow? | ||
252 | dst[0]= palette[src[i]*4+0]; | ||
253 | dst[1]= palette[src[i]*4+1]; | ||
254 | dst[2]= palette[src[i]*4+2]; | ||
255 | //dst[3]= 0; /* do we need this cleansing? */ | ||
256 | #endif | ||
257 | |||
258 | dst+= 4; | ||
259 | } | ||
260 | } | ||
261 | |||
262 | /** | ||
263 | * Palette is assumed to contain BGR32. | ||
264 | */ | ||
265 | void palette8torgb24(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette) | ||
266 | { | ||
267 | long i; | ||
268 | /* | ||
269 | Writes 1 byte too much and might cause alignment issues on some architectures? | ||
270 | for (i=0; i<num_pixels; i++) | ||
271 | ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[src[i]]; | ||
272 | */ | ||
273 | for (i=0; i<num_pixels; i++) | ||
274 | { | ||
275 | //FIXME slow? | ||
276 | dst[0]= palette[src[i]*4+2]; | ||
277 | dst[1]= palette[src[i]*4+1]; | ||
278 | dst[2]= palette[src[i]*4+0]; | ||
279 | dst+= 3; | ||
280 | } | ||
281 | } | ||
282 | |||
283 | void palette8tobgr24(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette) | ||
284 | { | ||
285 | long i; | ||
286 | /* | ||
287 | Writes 1 byte too much and might cause alignment issues on some architectures? | ||
288 | for (i=0; i<num_pixels; i++) | ||
289 | ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[src[i]]; | ||
290 | */ | ||
291 | for (i=0; i<num_pixels; i++) | ||
292 | { | ||
293 | //FIXME slow? | ||
294 | dst[0]= palette[src[i]*4+0]; | ||
295 | dst[1]= palette[src[i]*4+1]; | ||
296 | dst[2]= palette[src[i]*4+2]; | ||
297 | dst+= 3; | ||
298 | } | ||
299 | } | ||
300 | |||
301 | /** | ||
302 | * Palette is assumed to contain BGR16, see rgb32to16 to convert the palette. | ||
303 | */ | ||
304 | void palette8torgb16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette) | ||
305 | { | ||
306 | long i; | ||
307 | for (i=0; i<num_pixels; i++) | ||
308 | ((uint16_t *)dst)[i] = ((const uint16_t *)palette)[src[i]]; | ||
309 | } | ||
310 | void palette8tobgr16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette) | ||
311 | { | ||
312 | long i; | ||
313 | for (i=0; i<num_pixels; i++) | ||
314 | ((uint16_t *)dst)[i] = bswap_16(((const uint16_t *)palette)[src[i]]); | ||
315 | } | ||
316 | |||
317 | /** | ||
318 | * Palette is assumed to contain BGR15, see rgb32to15 to convert the palette. | ||
319 | */ | ||
320 | void palette8torgb15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette) | ||
321 | { | ||
322 | long i; | ||
323 | for (i=0; i<num_pixels; i++) | ||
324 | ((uint16_t *)dst)[i] = ((const uint16_t *)palette)[src[i]]; | ||
325 | } | ||
326 | void palette8tobgr15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette) | ||
327 | { | ||
328 | long i; | ||
329 | for (i=0; i<num_pixels; i++) | ||
330 | ((uint16_t *)dst)[i] = bswap_16(((const uint16_t *)palette)[src[i]]); | ||
331 | } | ||
332 | |||
333 | void rgb32tobgr24(const uint8_t *src, uint8_t *dst, long src_size) | ||
334 | { | ||
335 | long i; | ||
336 | long num_pixels = src_size >> 2; | ||
337 | for (i=0; i<num_pixels; i++) | ||
338 | { | ||
339 | #ifdef WORDS_BIGENDIAN | ||
340 | /* RGB32 (= A,B,G,R) -> BGR24 (= B,G,R) */ | ||
341 | dst[3*i + 0] = src[4*i + 1]; | ||
342 | dst[3*i + 1] = src[4*i + 2]; | ||
343 | dst[3*i + 2] = src[4*i + 3]; | ||
344 | #else | ||
345 | dst[3*i + 0] = src[4*i + 2]; | ||
346 | dst[3*i + 1] = src[4*i + 1]; | ||
347 | dst[3*i + 2] = src[4*i + 0]; | ||
348 | #endif | ||
349 | } | ||
350 | } | ||
351 | |||
352 | void rgb24tobgr32(const uint8_t *src, uint8_t *dst, long src_size) | ||
353 | { | ||
354 | long i; | ||
355 | for (i=0; 3*i<src_size; i++) | ||
356 | { | ||
357 | #ifdef WORDS_BIGENDIAN | ||
358 | /* RGB24 (= R,G,B) -> BGR32 (= A,R,G,B) */ | ||
359 | dst[4*i + 0] = 0; | ||
360 | dst[4*i + 1] = src[3*i + 0]; | ||
361 | dst[4*i + 2] = src[3*i + 1]; | ||
362 | dst[4*i + 3] = src[3*i + 2]; | ||
363 | #else | ||
364 | dst[4*i + 0] = src[3*i + 2]; | ||
365 | dst[4*i + 1] = src[3*i + 1]; | ||
366 | dst[4*i + 2] = src[3*i + 0]; | ||
367 | dst[4*i + 3] = 0; | ||
368 | #endif | ||
369 | } | ||
370 | } | ||
371 | |||
372 | void rgb16tobgr32(const uint8_t *src, uint8_t *dst, long src_size) | ||
373 | { | ||
374 | const uint16_t *end; | ||
375 | uint8_t *d = dst; | ||
376 | const uint16_t *s = (const uint16_t *)src; | ||
377 | end = s + src_size/2; | ||
378 | while (s < end) | ||
379 | { | ||
380 | register uint16_t bgr; | ||
381 | bgr = *s++; | ||
382 | #ifdef WORDS_BIGENDIAN | ||
383 | *d++ = 0; | ||
384 | *d++ = (bgr&0x1F)<<3; | ||
385 | *d++ = (bgr&0x7E0)>>3; | ||
386 | *d++ = (bgr&0xF800)>>8; | ||
387 | #else | ||
388 | *d++ = (bgr&0xF800)>>8; | ||
389 | *d++ = (bgr&0x7E0)>>3; | ||
390 | *d++ = (bgr&0x1F)<<3; | ||
391 | *d++ = 0; | ||
392 | #endif | ||
393 | } | ||
394 | } | ||
395 | |||
396 | void rgb16tobgr24(const uint8_t *src, uint8_t *dst, long src_size) | ||
397 | { | ||
398 | const uint16_t *end; | ||
399 | uint8_t *d = dst; | ||
400 | const uint16_t *s = (const uint16_t *)src; | ||
401 | end = s + src_size/2; | ||
402 | while (s < end) | ||
403 | { | ||
404 | register uint16_t bgr; | ||
405 | bgr = *s++; | ||
406 | *d++ = (bgr&0xF800)>>8; | ||
407 | *d++ = (bgr&0x7E0)>>3; | ||
408 | *d++ = (bgr&0x1F)<<3; | ||
409 | } | ||
410 | } | ||
411 | |||
412 | void rgb16tobgr16(const uint8_t *src, uint8_t *dst, long src_size) | ||
413 | { | ||
414 | long i; | ||
415 | long num_pixels = src_size >> 1; | ||
416 | |||
417 | for (i=0; i<num_pixels; i++) | ||
418 | { | ||
419 | unsigned b,g,r; | ||
420 | register uint16_t rgb; | ||
421 | rgb = src[2*i]; | ||
422 | r = rgb&0x1F; | ||
423 | g = (rgb&0x7E0)>>5; | ||
424 | b = (rgb&0xF800)>>11; | ||
425 | dst[2*i] = (b&0x1F) | ((g&0x3F)<<5) | ((r&0x1F)<<11); | ||
426 | } | ||
427 | } | ||
428 | |||
429 | void rgb16tobgr15(const uint8_t *src, uint8_t *dst, long src_size) | ||
430 | { | ||
431 | long i; | ||
432 | long num_pixels = src_size >> 1; | ||
433 | |||
434 | for (i=0; i<num_pixels; i++) | ||
435 | { | ||
436 | unsigned b,g,r; | ||
437 | register uint16_t rgb; | ||
438 | rgb = src[2*i]; | ||
439 | r = rgb&0x1F; | ||
440 | g = (rgb&0x7E0)>>5; | ||
441 | b = (rgb&0xF800)>>11; | ||
442 | dst[2*i] = (b&0x1F) | ((g&0x1F)<<5) | ((r&0x1F)<<10); | ||
443 | } | ||
444 | } | ||
445 | |||
446 | void rgb15tobgr32(const uint8_t *src, uint8_t *dst, long src_size) | ||
447 | { | ||
448 | const uint16_t *end; | ||
449 | uint8_t *d = dst; | ||
450 | const uint16_t *s = (const uint16_t *)src; | ||
451 | end = s + src_size/2; | ||
452 | while (s < end) | ||
453 | { | ||
454 | register uint16_t bgr; | ||
455 | bgr = *s++; | ||
456 | #ifdef WORDS_BIGENDIAN | ||
457 | *d++ = 0; | ||
458 | *d++ = (bgr&0x1F)<<3; | ||
459 | *d++ = (bgr&0x3E0)>>2; | ||
460 | *d++ = (bgr&0x7C00)>>7; | ||
461 | #else | ||
462 | *d++ = (bgr&0x7C00)>>7; | ||
463 | *d++ = (bgr&0x3E0)>>2; | ||
464 | *d++ = (bgr&0x1F)<<3; | ||
465 | *d++ = 0; | ||
466 | #endif | ||
467 | } | ||
468 | } | ||
469 | |||
470 | void rgb15tobgr24(const uint8_t *src, uint8_t *dst, long src_size) | ||
471 | { | ||
472 | const uint16_t *end; | ||
473 | uint8_t *d = dst; | ||
474 | const uint16_t *s = (const uint16_t *)src; | ||
475 | end = s + src_size/2; | ||
476 | while (s < end) | ||
477 | { | ||
478 | register uint16_t bgr; | ||
479 | bgr = *s++; | ||
480 | *d++ = (bgr&0x7C00)>>7; | ||
481 | *d++ = (bgr&0x3E0)>>2; | ||
482 | *d++ = (bgr&0x1F)<<3; | ||
483 | } | ||
484 | } | ||
485 | |||
486 | void rgb15tobgr16(const uint8_t *src, uint8_t *dst, long src_size) | ||
487 | { | ||
488 | long i; | ||
489 | long num_pixels = src_size >> 1; | ||
490 | |||
491 | for (i=0; i<num_pixels; i++) | ||
492 | { | ||
493 | unsigned b,g,r; | ||
494 | register uint16_t rgb; | ||
495 | rgb = src[2*i]; | ||
496 | r = rgb&0x1F; | ||
497 | g = (rgb&0x3E0)>>5; | ||
498 | b = (rgb&0x7C00)>>10; | ||
499 | dst[2*i] = (b&0x1F) | ((g&0x3F)<<5) | ((r&0x1F)<<11); | ||
500 | } | ||
501 | } | ||
502 | |||
503 | void rgb15tobgr15(const uint8_t *src, uint8_t *dst, long src_size) | ||
504 | { | ||
505 | long i; | ||
506 | long num_pixels = src_size >> 1; | ||
507 | |||
508 | for (i=0; i<num_pixels; i++) | ||
509 | { | ||
510 | unsigned b,g,r; | ||
511 | register uint16_t rgb; | ||
512 | rgb = src[2*i]; | ||
513 | r = rgb&0x1F; | ||
514 | g = (rgb&0x3E0)>>5; | ||
515 | b = (rgb&0x7C00)>>10; | ||
516 | dst[2*i] = (b&0x1F) | ((g&0x1F)<<5) | ((r&0x1F)<<10); | ||
517 | } | ||
518 | } | ||
519 | |||
520 | void rgb8tobgr8(const uint8_t *src, uint8_t *dst, long src_size) | ||
521 | { | ||
522 | long i; | ||
523 | long num_pixels = src_size; | ||
524 | for (i=0; i<num_pixels; i++) | ||
525 | { | ||
526 | unsigned b,g,r; | ||
527 | register uint8_t rgb; | ||
528 | rgb = src[i]; | ||
529 | r = (rgb&0x07); | ||
530 | g = (rgb&0x38)>>3; | ||
531 | b = (rgb&0xC0)>>6; | ||
532 | dst[i] = ((b<<1)&0x07) | ((g&0x07)<<3) | ((r&0x03)<<6); | ||
533 | } | ||
534 | } | ||
diff --git a/src/plugins/ffmpeg/libswscale/rgb2rgb.h b/src/plugins/ffmpeg/libswscale/rgb2rgb.h deleted file mode 100644 index f2697c6..0000000 --- a/src/plugins/ffmpeg/libswscale/rgb2rgb.h +++ /dev/null | |||
@@ -1,142 +0,0 @@ | |||
1 | /* | ||
2 | * software RGB to RGB converter | ||
3 | * pluralize by Software PAL8 to RGB converter | ||
4 | * Software YUV to YUV converter | ||
5 | * Software YUV to RGB converter | ||
6 | * Written by Nick Kurshev. | ||
7 | * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) | ||
8 | * | ||
9 | * This file is part of FFmpeg. | ||
10 | * | ||
11 | * FFmpeg is free software; you can redistribute it and/or | ||
12 | * modify it under the terms of the GNU Lesser General Public | ||
13 | * License as published by the Free Software Foundation; either | ||
14 | * version 2.1 of the License, or (at your option) any later version. | ||
15 | * | ||
16 | * FFmpeg is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
19 | * Lesser General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU Lesser General Public | ||
22 | * License along with FFmpeg; if not, write to the Free Software | ||
23 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
24 | */ | ||
25 | |||
26 | #ifndef FFMPEG_RGB2RGB_H | ||
27 | #define FFMPEG_RGB2RGB_H | ||
28 | |||
29 | #include <inttypes.h> | ||
30 | |||
31 | /* A full collection of RGB to RGB(BGR) converters */ | ||
32 | extern void (*rgb24to32) (const uint8_t *src, uint8_t *dst, long src_size); | ||
33 | extern void (*rgb24to16) (const uint8_t *src, uint8_t *dst, long src_size); | ||
34 | extern void (*rgb24to15) (const uint8_t *src, uint8_t *dst, long src_size); | ||
35 | extern void (*rgb32to24) (const uint8_t *src, uint8_t *dst, long src_size); | ||
36 | extern void (*rgb32to16) (const uint8_t *src, uint8_t *dst, long src_size); | ||
37 | extern void (*rgb32to15) (const uint8_t *src, uint8_t *dst, long src_size); | ||
38 | extern void (*rgb15to16) (const uint8_t *src, uint8_t *dst, long src_size); | ||
39 | extern void (*rgb15to24) (const uint8_t *src, uint8_t *dst, long src_size); | ||
40 | extern void (*rgb15to32) (const uint8_t *src, uint8_t *dst, long src_size); | ||
41 | extern void (*rgb16to15) (const uint8_t *src, uint8_t *dst, long src_size); | ||
42 | extern void (*rgb16to24) (const uint8_t *src, uint8_t *dst, long src_size); | ||
43 | extern void (*rgb16to32) (const uint8_t *src, uint8_t *dst, long src_size); | ||
44 | extern void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size); | ||
45 | extern void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size); | ||
46 | extern void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size); | ||
47 | extern void (*rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size); | ||
48 | extern void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size); | ||
49 | extern void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size); | ||
50 | |||
51 | extern void rgb24tobgr32(const uint8_t *src, uint8_t *dst, long src_size); | ||
52 | extern void rgb32tobgr24(const uint8_t *src, uint8_t *dst, long src_size); | ||
53 | extern void rgb16tobgr32(const uint8_t *src, uint8_t *dst, long src_size); | ||
54 | extern void rgb16tobgr24(const uint8_t *src, uint8_t *dst, long src_size); | ||
55 | extern void rgb16tobgr16(const uint8_t *src, uint8_t *dst, long src_size); | ||
56 | extern void rgb16tobgr15(const uint8_t *src, uint8_t *dst, long src_size); | ||
57 | extern void rgb15tobgr32(const uint8_t *src, uint8_t *dst, long src_size); | ||
58 | extern void rgb15tobgr24(const uint8_t *src, uint8_t *dst, long src_size); | ||
59 | extern void rgb15tobgr16(const uint8_t *src, uint8_t *dst, long src_size); | ||
60 | extern void rgb15tobgr15(const uint8_t *src, uint8_t *dst, long src_size); | ||
61 | extern void rgb8tobgr8 (const uint8_t *src, uint8_t *dst, long src_size); | ||
62 | |||
63 | |||
64 | extern void palette8torgb32(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette); | ||
65 | extern void palette8tobgr32(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette); | ||
66 | extern void palette8torgb24(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette); | ||
67 | extern void palette8tobgr24(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette); | ||
68 | extern void palette8torgb16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette); | ||
69 | extern void palette8tobgr16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette); | ||
70 | extern void palette8torgb15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette); | ||
71 | extern void palette8tobgr15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette); | ||
72 | |||
73 | /** | ||
74 | * Height should be a multiple of 2 and width should be a multiple of 16. | ||
75 | * (If this is a problem for anyone then tell me, and I will fix it.) | ||
76 | * Chrominance data is only taken from every second line, others are ignored. | ||
77 | * FIXME: Write HQ version. | ||
78 | */ | ||
79 | //void uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | ||
80 | |||
81 | /** | ||
82 | * Height should be a multiple of 2 and width should be a multiple of 16. | ||
83 | * (If this is a problem for anyone then tell me, and I will fix it.) | ||
84 | */ | ||
85 | extern void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | ||
86 | long width, long height, | ||
87 | long lumStride, long chromStride, long dstStride); | ||
88 | |||
89 | /** | ||
90 | * Width should be a multiple of 16. | ||
91 | */ | ||
92 | extern void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | ||
93 | long width, long height, | ||
94 | long lumStride, long chromStride, long dstStride); | ||
95 | |||
96 | /** | ||
97 | * Height should be a multiple of 2 and width should be a multiple of 16. | ||
98 | * (If this is a problem for anyone then tell me, and I will fix it.) | ||
99 | */ | ||
100 | extern void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | ||
101 | long width, long height, | ||
102 | long lumStride, long chromStride, long srcStride); | ||
103 | |||
104 | /** | ||
105 | * Height should be a multiple of 2 and width should be a multiple of 16. | ||
106 | * (If this is a problem for anyone then tell me, and I will fix it.) | ||
107 | */ | ||
108 | extern void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | ||
109 | long width, long height, | ||
110 | long lumStride, long chromStride, long dstStride); | ||
111 | |||
112 | /** | ||
113 | * Height should be a multiple of 2 and width should be a multiple of 2. | ||
114 | * (If this is a problem for anyone then tell me, and I will fix it.) | ||
115 | * Chrominance data is only taken from every second line, others are ignored. | ||
116 | * FIXME: Write HQ version. | ||
117 | */ | ||
118 | extern void (*rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | ||
119 | long width, long height, | ||
120 | long lumStride, long chromStride, long srcStride); | ||
121 | extern void (*planar2x)(const uint8_t *src, uint8_t *dst, long width, long height, | ||
122 | long srcStride, long dstStride); | ||
123 | |||
124 | extern void (*interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dst, | ||
125 | long width, long height, long src1Stride, | ||
126 | long src2Stride, long dstStride); | ||
127 | |||
128 | extern void (*vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2, | ||
129 | uint8_t *dst1, uint8_t *dst2, | ||
130 | long width, long height, | ||
131 | long srcStride1, long srcStride2, | ||
132 | long dstStride1, long dstStride2); | ||
133 | |||
134 | extern void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, | ||
135 | uint8_t *dst, | ||
136 | long width, long height, | ||
137 | long srcStride1, long srcStride2, | ||
138 | long srcStride3, long dstStride); | ||
139 | |||
140 | void sws_rgb2rgb_init(int flags); | ||
141 | |||
142 | #endif /* FFMPEG_RGB2RGB_H */ | ||
diff --git a/src/plugins/ffmpeg/libswscale/rgb2rgb_template.c b/src/plugins/ffmpeg/libswscale/rgb2rgb_template.c deleted file mode 100644 index ffbf2c7..0000000 --- a/src/plugins/ffmpeg/libswscale/rgb2rgb_template.c +++ /dev/null | |||
@@ -1,2738 +0,0 @@ | |||
1 | /* | ||
2 | * software RGB to RGB converter | ||
3 | * pluralize by software PAL8 to RGB converter | ||
4 | * software YUV to YUV converter | ||
5 | * software YUV to RGB converter | ||
6 | * Written by Nick Kurshev. | ||
7 | * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) | ||
8 | * lot of big-endian byte order fixes by Alex Beregszaszi | ||
9 | * | ||
10 | * This file is part of FFmpeg. | ||
11 | * | ||
12 | * FFmpeg is free software; you can redistribute it and/or modify | ||
13 | * it under the terms of the GNU General Public License as published by | ||
14 | * the Free Software Foundation; either version 2 of the License, or | ||
15 | * (at your option) any later version. | ||
16 | * | ||
17 | * FFmpeg is distributed in the hope that it will be useful, | ||
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
20 | * GNU General Public License for more details. | ||
21 | * | ||
22 | * You should have received a copy of the GNU General Public License | ||
23 | * along with FFmpeg; if not, write to the Free Software | ||
24 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
25 | * | ||
26 | * The C code (not assembly, MMX, ...) of this file can be used | ||
27 | * under the LGPL license. | ||
28 | */ | ||
29 | |||
30 | #include <stddef.h> | ||
31 | #include <inttypes.h> /* for __WORDSIZE */ | ||
32 | |||
33 | #ifndef __WORDSIZE | ||
34 | // #warning You have a misconfigured system and will probably lose performance! | ||
35 | #define __WORDSIZE MP_WORDSIZE | ||
36 | #endif | ||
37 | |||
38 | #undef PREFETCH | ||
39 | #undef MOVNTQ | ||
40 | #undef EMMS | ||
41 | #undef SFENCE | ||
42 | #undef MMREG_SIZE | ||
43 | #undef PREFETCHW | ||
44 | #undef PAVGB | ||
45 | |||
46 | #ifdef HAVE_SSE2 | ||
47 | #define MMREG_SIZE 16 | ||
48 | #else | ||
49 | #define MMREG_SIZE 8 | ||
50 | #endif | ||
51 | |||
52 | #ifdef HAVE_3DNOW | ||
53 | #define PREFETCH "prefetch" | ||
54 | #define PREFETCHW "prefetchw" | ||
55 | #define PAVGB "pavgusb" | ||
56 | #elif defined (HAVE_MMX2) | ||
57 | #define PREFETCH "prefetchnta" | ||
58 | #define PREFETCHW "prefetcht0" | ||
59 | #define PAVGB "pavgb" | ||
60 | #else | ||
61 | #ifdef __APPLE__ | ||
62 | #define PREFETCH "#" | ||
63 | #define PREFETCHW "#" | ||
64 | #else | ||
65 | #define PREFETCH " # nop" | ||
66 | #define PREFETCHW " # nop" | ||
67 | #endif | ||
68 | #endif | ||
69 | |||
70 | #ifdef HAVE_3DNOW | ||
71 | /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */ | ||
72 | #define EMMS "femms" | ||
73 | #else | ||
74 | #define EMMS "emms" | ||
75 | #endif | ||
76 | |||
77 | #ifdef HAVE_MMX2 | ||
78 | #define MOVNTQ "movntq" | ||
79 | #define SFENCE "sfence" | ||
80 | #else | ||
81 | #define MOVNTQ "movq" | ||
82 | #define SFENCE " # nop" | ||
83 | #endif | ||
84 | |||
85 | static inline void RENAME(rgb24to32)(const uint8_t *src, uint8_t *dst, long src_size) | ||
86 | { | ||
87 | uint8_t *dest = dst; | ||
88 | const uint8_t *s = src; | ||
89 | const uint8_t *end; | ||
90 | #ifdef HAVE_MMX | ||
91 | const uint8_t *mm_end; | ||
92 | #endif | ||
93 | end = s + src_size; | ||
94 | #ifdef HAVE_MMX | ||
95 | asm volatile(PREFETCH" %0"::"m"(*s):"memory"); | ||
96 | mm_end = end - 23; | ||
97 | asm volatile("movq %0, %%mm7"::"m"(mask32):"memory"); | ||
98 | while (s < mm_end) | ||
99 | { | ||
100 | asm volatile( | ||
101 | PREFETCH" 32%1 \n\t" | ||
102 | "movd %1, %%mm0 \n\t" | ||
103 | "punpckldq 3%1, %%mm0 \n\t" | ||
104 | "movd 6%1, %%mm1 \n\t" | ||
105 | "punpckldq 9%1, %%mm1 \n\t" | ||
106 | "movd 12%1, %%mm2 \n\t" | ||
107 | "punpckldq 15%1, %%mm2 \n\t" | ||
108 | "movd 18%1, %%mm3 \n\t" | ||
109 | "punpckldq 21%1, %%mm3 \n\t" | ||
110 | "pand %%mm7, %%mm0 \n\t" | ||
111 | "pand %%mm7, %%mm1 \n\t" | ||
112 | "pand %%mm7, %%mm2 \n\t" | ||
113 | "pand %%mm7, %%mm3 \n\t" | ||
114 | MOVNTQ" %%mm0, %0 \n\t" | ||
115 | MOVNTQ" %%mm1, 8%0 \n\t" | ||
116 | MOVNTQ" %%mm2, 16%0 \n\t" | ||
117 | MOVNTQ" %%mm3, 24%0" | ||
118 | :"=m"(*dest) | ||
119 | :"m"(*s) | ||
120 | :"memory"); | ||
121 | dest += 32; | ||
122 | s += 24; | ||
123 | } | ||
124 | asm volatile(SFENCE:::"memory"); | ||
125 | asm volatile(EMMS:::"memory"); | ||
126 | #endif | ||
127 | while (s < end) | ||
128 | { | ||
129 | #ifdef WORDS_BIGENDIAN | ||
130 | /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */ | ||
131 | *dest++ = 0; | ||
132 | *dest++ = s[2]; | ||
133 | *dest++ = s[1]; | ||
134 | *dest++ = s[0]; | ||
135 | s+=3; | ||
136 | #else | ||
137 | *dest++ = *s++; | ||
138 | *dest++ = *s++; | ||
139 | *dest++ = *s++; | ||
140 | *dest++ = 0; | ||
141 | #endif | ||
142 | } | ||
143 | } | ||
144 | |||
145 | static inline void RENAME(rgb32to24)(const uint8_t *src, uint8_t *dst, long src_size) | ||
146 | { | ||
147 | uint8_t *dest = dst; | ||
148 | const uint8_t *s = src; | ||
149 | const uint8_t *end; | ||
150 | #ifdef HAVE_MMX | ||
151 | const uint8_t *mm_end; | ||
152 | #endif | ||
153 | end = s + src_size; | ||
154 | #ifdef HAVE_MMX | ||
155 | asm volatile(PREFETCH" %0"::"m"(*s):"memory"); | ||
156 | mm_end = end - 31; | ||
157 | while (s < mm_end) | ||
158 | { | ||
159 | asm volatile( | ||
160 | PREFETCH" 32%1 \n\t" | ||
161 | "movq %1, %%mm0 \n\t" | ||
162 | "movq 8%1, %%mm1 \n\t" | ||
163 | "movq 16%1, %%mm4 \n\t" | ||
164 | "movq 24%1, %%mm5 \n\t" | ||
165 | "movq %%mm0, %%mm2 \n\t" | ||
166 | "movq %%mm1, %%mm3 \n\t" | ||
167 | "movq %%mm4, %%mm6 \n\t" | ||
168 | "movq %%mm5, %%mm7 \n\t" | ||
169 | "psrlq $8, %%mm2 \n\t" | ||
170 | "psrlq $8, %%mm3 \n\t" | ||
171 | "psrlq $8, %%mm6 \n\t" | ||
172 | "psrlq $8, %%mm7 \n\t" | ||
173 | "pand %2, %%mm0 \n\t" | ||
174 | "pand %2, %%mm1 \n\t" | ||
175 | "pand %2, %%mm4 \n\t" | ||
176 | "pand %2, %%mm5 \n\t" | ||
177 | "pand %3, %%mm2 \n\t" | ||
178 | "pand %3, %%mm3 \n\t" | ||
179 | "pand %3, %%mm6 \n\t" | ||
180 | "pand %3, %%mm7 \n\t" | ||
181 | "por %%mm2, %%mm0 \n\t" | ||
182 | "por %%mm3, %%mm1 \n\t" | ||
183 | "por %%mm6, %%mm4 \n\t" | ||
184 | "por %%mm7, %%mm5 \n\t" | ||
185 | |||
186 | "movq %%mm1, %%mm2 \n\t" | ||
187 | "movq %%mm4, %%mm3 \n\t" | ||
188 | "psllq $48, %%mm2 \n\t" | ||
189 | "psllq $32, %%mm3 \n\t" | ||
190 | "pand %4, %%mm2 \n\t" | ||
191 | "pand %5, %%mm3 \n\t" | ||
192 | "por %%mm2, %%mm0 \n\t" | ||
193 | "psrlq $16, %%mm1 \n\t" | ||
194 | "psrlq $32, %%mm4 \n\t" | ||
195 | "psllq $16, %%mm5 \n\t" | ||
196 | "por %%mm3, %%mm1 \n\t" | ||
197 | "pand %6, %%mm5 \n\t" | ||
198 | "por %%mm5, %%mm4 \n\t" | ||
199 | |||
200 | MOVNTQ" %%mm0, %0 \n\t" | ||
201 | MOVNTQ" %%mm1, 8%0 \n\t" | ||
202 | MOVNTQ" %%mm4, 16%0" | ||
203 | :"=m"(*dest) | ||
204 | :"m"(*s),"m"(mask24l), | ||
205 | "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) | ||
206 | :"memory"); | ||
207 | dest += 24; | ||
208 | s += 32; | ||
209 | } | ||
210 | asm volatile(SFENCE:::"memory"); | ||
211 | asm volatile(EMMS:::"memory"); | ||
212 | #endif | ||
213 | while (s < end) | ||
214 | { | ||
215 | #ifdef WORDS_BIGENDIAN | ||
216 | /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */ | ||
217 | s++; | ||
218 | dest[2] = *s++; | ||
219 | dest[1] = *s++; | ||
220 | dest[0] = *s++; | ||
221 | dest += 3; | ||
222 | #else | ||
223 | *dest++ = *s++; | ||
224 | *dest++ = *s++; | ||
225 | *dest++ = *s++; | ||
226 | s++; | ||
227 | #endif | ||
228 | } | ||
229 | } | ||
230 | |||
231 | /* | ||
232 | original by Strepto/Astral | ||
233 | ported to gcc & bugfixed: A'rpi | ||
234 | MMX2, 3DNOW optimization by Nick Kurshev | ||
235 | 32-bit C version, and and&add trick by Michael Niedermayer | ||
236 | */ | ||
237 | static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size) | ||
238 | { | ||
239 | register const uint8_t* s=src; | ||
240 | register uint8_t* d=dst; | ||
241 | register const uint8_t *end; | ||
242 | const uint8_t *mm_end; | ||
243 | end = s + src_size; | ||
244 | #ifdef HAVE_MMX | ||
245 | asm volatile(PREFETCH" %0"::"m"(*s)); | ||
246 | asm volatile("movq %0, %%mm4"::"m"(mask15s)); | ||
247 | mm_end = end - 15; | ||
248 | while (s<mm_end) | ||
249 | { | ||
250 | asm volatile( | ||
251 | PREFETCH" 32%1 \n\t" | ||
252 | "movq %1, %%mm0 \n\t" | ||
253 | "movq 8%1, %%mm2 \n\t" | ||
254 | "movq %%mm0, %%mm1 \n\t" | ||
255 | "movq %%mm2, %%mm3 \n\t" | ||
256 | "pand %%mm4, %%mm0 \n\t" | ||
257 | "pand %%mm4, %%mm2 \n\t" | ||
258 | "paddw %%mm1, %%mm0 \n\t" | ||
259 | "paddw %%mm3, %%mm2 \n\t" | ||
260 | MOVNTQ" %%mm0, %0 \n\t" | ||
261 | MOVNTQ" %%mm2, 8%0" | ||
262 | :"=m"(*d) | ||
263 | :"m"(*s) | ||
264 | ); | ||
265 | d+=16; | ||
266 | s+=16; | ||
267 | } | ||
268 | asm volatile(SFENCE:::"memory"); | ||
269 | asm volatile(EMMS:::"memory"); | ||
270 | #endif | ||
271 | mm_end = end - 3; | ||
272 | while (s < mm_end) | ||
273 | { | ||
274 | register unsigned x= *((const uint32_t *)s); | ||
275 | *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0); | ||
276 | d+=4; | ||
277 | s+=4; | ||
278 | } | ||
279 | if (s < end) | ||
280 | { | ||
281 | register unsigned short x= *((const uint16_t *)s); | ||
282 | *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0); | ||
283 | } | ||
284 | } | ||
285 | |||
286 | static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size) | ||
287 | { | ||
288 | register const uint8_t* s=src; | ||
289 | register uint8_t* d=dst; | ||
290 | register const uint8_t *end; | ||
291 | const uint8_t *mm_end; | ||
292 | end = s + src_size; | ||
293 | #ifdef HAVE_MMX | ||
294 | asm volatile(PREFETCH" %0"::"m"(*s)); | ||
295 | asm volatile("movq %0, %%mm7"::"m"(mask15rg)); | ||
296 | asm volatile("movq %0, %%mm6"::"m"(mask15b)); | ||
297 | mm_end = end - 15; | ||
298 | while (s<mm_end) | ||
299 | { | ||
300 | asm volatile( | ||
301 | PREFETCH" 32%1 \n\t" | ||
302 | "movq %1, %%mm0 \n\t" | ||
303 | "movq 8%1, %%mm2 \n\t" | ||
304 | "movq %%mm0, %%mm1 \n\t" | ||
305 | "movq %%mm2, %%mm3 \n\t" | ||
306 | "psrlq $1, %%mm0 \n\t" | ||
307 | "psrlq $1, %%mm2 \n\t" | ||
308 | "pand %%mm7, %%mm0 \n\t" | ||
309 | "pand %%mm7, %%mm2 \n\t" | ||
310 | "pand %%mm6, %%mm1 \n\t" | ||
311 | "pand %%mm6, %%mm3 \n\t" | ||
312 | "por %%mm1, %%mm0 \n\t" | ||
313 | "por %%mm3, %%mm2 \n\t" | ||
314 | MOVNTQ" %%mm0, %0 \n\t" | ||
315 | MOVNTQ" %%mm2, 8%0" | ||
316 | :"=m"(*d) | ||
317 | :"m"(*s) | ||
318 | ); | ||
319 | d+=16; | ||
320 | s+=16; | ||
321 | } | ||
322 | asm volatile(SFENCE:::"memory"); | ||
323 | asm volatile(EMMS:::"memory"); | ||
324 | #endif | ||
325 | mm_end = end - 3; | ||
326 | while (s < mm_end) | ||
327 | { | ||
328 | register uint32_t x= *((const uint32_t*)s); | ||
329 | *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F); | ||
330 | s+=4; | ||
331 | d+=4; | ||
332 | } | ||
333 | if (s < end) | ||
334 | { | ||
335 | register uint16_t x= *((const uint16_t*)s); | ||
336 | *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F); | ||
337 | s+=2; | ||
338 | d+=2; | ||
339 | } | ||
340 | } | ||
341 | |||
342 | static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size) | ||
343 | { | ||
344 | const uint8_t *s = src; | ||
345 | const uint8_t *end; | ||
346 | #ifdef HAVE_MMX | ||
347 | const uint8_t *mm_end; | ||
348 | #endif | ||
349 | uint16_t *d = (uint16_t *)dst; | ||
350 | end = s + src_size; | ||
351 | #ifdef HAVE_MMX | ||
352 | mm_end = end - 15; | ||
353 | #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster) | ||
354 | asm volatile( | ||
355 | "movq %3, %%mm5 \n\t" | ||
356 | "movq %4, %%mm6 \n\t" | ||
357 | "movq %5, %%mm7 \n\t" | ||
358 | "jmp 2f \n\t" | ||
359 | ASMALIGN(4) | ||
360 | "1: \n\t" | ||
361 | PREFETCH" 32(%1) \n\t" | ||
362 | "movd (%1), %%mm0 \n\t" | ||
363 | "movd 4(%1), %%mm3 \n\t" | ||
364 | "punpckldq 8(%1), %%mm0 \n\t" | ||
365 | "punpckldq 12(%1), %%mm3 \n\t" | ||
366 | "movq %%mm0, %%mm1 \n\t" | ||
367 | "movq %%mm3, %%mm4 \n\t" | ||
368 | "pand %%mm6, %%mm0 \n\t" | ||
369 | "pand %%mm6, %%mm3 \n\t" | ||
370 | "pmaddwd %%mm7, %%mm0 \n\t" | ||
371 | "pmaddwd %%mm7, %%mm3 \n\t" | ||
372 | "pand %%mm5, %%mm1 \n\t" | ||
373 | "pand %%mm5, %%mm4 \n\t" | ||
374 | "por %%mm1, %%mm0 \n\t" | ||
375 | "por %%mm4, %%mm3 \n\t" | ||
376 | "psrld $5, %%mm0 \n\t" | ||
377 | "pslld $11, %%mm3 \n\t" | ||
378 | "por %%mm3, %%mm0 \n\t" | ||
379 | MOVNTQ" %%mm0, (%0) \n\t" | ||
380 | "add $16, %1 \n\t" | ||
381 | "add $8, %0 \n\t" | ||
382 | "2: \n\t" | ||
383 | "cmp %2, %1 \n\t" | ||
384 | " jb 1b \n\t" | ||
385 | : "+r" (d), "+r"(s) | ||
386 | : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) | ||
387 | ); | ||
388 | #else | ||
389 | asm volatile(PREFETCH" %0"::"m"(*src):"memory"); | ||
390 | asm volatile( | ||
391 | "movq %0, %%mm7 \n\t" | ||
392 | "movq %1, %%mm6 \n\t" | ||
393 | ::"m"(red_16mask),"m"(green_16mask)); | ||
394 | while (s < mm_end) | ||
395 | { | ||
396 | asm volatile( | ||
397 | PREFETCH" 32%1 \n\t" | ||
398 | "movd %1, %%mm0 \n\t" | ||
399 | "movd 4%1, %%mm3 \n\t" | ||
400 | "punpckldq 8%1, %%mm0 \n\t" | ||
401 | "punpckldq 12%1, %%mm3 \n\t" | ||
402 | "movq %%mm0, %%mm1 \n\t" | ||
403 | "movq %%mm0, %%mm2 \n\t" | ||
404 | "movq %%mm3, %%mm4 \n\t" | ||
405 | "movq %%mm3, %%mm5 \n\t" | ||
406 | "psrlq $3, %%mm0 \n\t" | ||
407 | "psrlq $3, %%mm3 \n\t" | ||
408 | "pand %2, %%mm0 \n\t" | ||
409 | "pand %2, %%mm3 \n\t" | ||
410 | "psrlq $5, %%mm1 \n\t" | ||
411 | "psrlq $5, %%mm4 \n\t" | ||
412 | "pand %%mm6, %%mm1 \n\t" | ||
413 | "pand %%mm6, %%mm4 \n\t" | ||
414 | "psrlq $8, %%mm2 \n\t" | ||
415 | "psrlq $8, %%mm5 \n\t" | ||
416 | "pand %%mm7, %%mm2 \n\t" | ||
417 | "pand %%mm7, %%mm5 \n\t" | ||
418 | "por %%mm1, %%mm0 \n\t" | ||
419 | "por %%mm4, %%mm3 \n\t" | ||
420 | "por %%mm2, %%mm0 \n\t" | ||
421 | "por %%mm5, %%mm3 \n\t" | ||
422 | "psllq $16, %%mm3 \n\t" | ||
423 | "por %%mm3, %%mm0 \n\t" | ||
424 | MOVNTQ" %%mm0, %0 \n\t" | ||
425 | :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); | ||
426 | d += 4; | ||
427 | s += 16; | ||
428 | } | ||
429 | #endif | ||
430 | asm volatile(SFENCE:::"memory"); | ||
431 | asm volatile(EMMS:::"memory"); | ||
432 | #endif | ||
433 | while (s < end) | ||
434 | { | ||
435 | register int rgb = *(const uint32_t*)s; s += 4; | ||
436 | *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8); | ||
437 | } | ||
438 | } | ||
439 | |||
440 | static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size) | ||
441 | { | ||
442 | const uint8_t *s = src; | ||
443 | const uint8_t *end; | ||
444 | #ifdef HAVE_MMX | ||
445 | const uint8_t *mm_end; | ||
446 | #endif | ||
447 | uint16_t *d = (uint16_t *)dst; | ||
448 | end = s + src_size; | ||
449 | #ifdef HAVE_MMX | ||
450 | asm volatile(PREFETCH" %0"::"m"(*src):"memory"); | ||
451 | asm volatile( | ||
452 | "movq %0, %%mm7 \n\t" | ||
453 | "movq %1, %%mm6 \n\t" | ||
454 | ::"m"(red_16mask),"m"(green_16mask)); | ||
455 | mm_end = end - 15; | ||
456 | while (s < mm_end) | ||
457 | { | ||
458 | asm volatile( | ||
459 | PREFETCH" 32%1 \n\t" | ||
460 | "movd %1, %%mm0 \n\t" | ||
461 | "movd 4%1, %%mm3 \n\t" | ||
462 | "punpckldq 8%1, %%mm0 \n\t" | ||
463 | "punpckldq 12%1, %%mm3 \n\t" | ||
464 | "movq %%mm0, %%mm1 \n\t" | ||
465 | "movq %%mm0, %%mm2 \n\t" | ||
466 | "movq %%mm3, %%mm4 \n\t" | ||
467 | "movq %%mm3, %%mm5 \n\t" | ||
468 | "psllq $8, %%mm0 \n\t" | ||
469 | "psllq $8, %%mm3 \n\t" | ||
470 | "pand %%mm7, %%mm0 \n\t" | ||
471 | "pand %%mm7, %%mm3 \n\t" | ||
472 | "psrlq $5, %%mm1 \n\t" | ||
473 | "psrlq $5, %%mm4 \n\t" | ||
474 | "pand %%mm6, %%mm1 \n\t" | ||
475 | "pand %%mm6, %%mm4 \n\t" | ||
476 | "psrlq $19, %%mm2 \n\t" | ||
477 | "psrlq $19, %%mm5 \n\t" | ||
478 | "pand %2, %%mm2 \n\t" | ||
479 | "pand %2, %%mm5 \n\t" | ||
480 | "por %%mm1, %%mm0 \n\t" | ||
481 | "por %%mm4, %%mm3 \n\t" | ||
482 | "por %%mm2, %%mm0 \n\t" | ||
483 | "por %%mm5, %%mm3 \n\t" | ||
484 | "psllq $16, %%mm3 \n\t" | ||
485 | "por %%mm3, %%mm0 \n\t" | ||
486 | MOVNTQ" %%mm0, %0 \n\t" | ||
487 | :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); | ||
488 | d += 4; | ||
489 | s += 16; | ||
490 | } | ||
491 | asm volatile(SFENCE:::"memory"); | ||
492 | asm volatile(EMMS:::"memory"); | ||
493 | #endif | ||
494 | while (s < end) | ||
495 | { | ||
496 | register int rgb = *(const uint32_t*)s; s += 4; | ||
497 | *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19); | ||
498 | } | ||
499 | } | ||
500 | |||
501 | static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size) | ||
502 | { | ||
503 | const uint8_t *s = src; | ||
504 | const uint8_t *end; | ||
505 | #ifdef HAVE_MMX | ||
506 | const uint8_t *mm_end; | ||
507 | #endif | ||
508 | uint16_t *d = (uint16_t *)dst; | ||
509 | end = s + src_size; | ||
510 | #ifdef HAVE_MMX | ||
511 | mm_end = end - 15; | ||
512 | #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster) | ||
513 | asm volatile( | ||
514 | "movq %3, %%mm5 \n\t" | ||
515 | "movq %4, %%mm6 \n\t" | ||
516 | "movq %5, %%mm7 \n\t" | ||
517 | "jmp 2f \n\t" | ||
518 | ASMALIGN(4) | ||
519 | "1: \n\t" | ||
520 | PREFETCH" 32(%1) \n\t" | ||
521 | "movd (%1), %%mm0 \n\t" | ||
522 | "movd 4(%1), %%mm3 \n\t" | ||
523 | "punpckldq 8(%1), %%mm0 \n\t" | ||
524 | "punpckldq 12(%1), %%mm3 \n\t" | ||
525 | "movq %%mm0, %%mm1 \n\t" | ||
526 | "movq %%mm3, %%mm4 \n\t" | ||
527 | "pand %%mm6, %%mm0 \n\t" | ||
528 | "pand %%mm6, %%mm3 \n\t" | ||
529 | "pmaddwd %%mm7, %%mm0 \n\t" | ||
530 | "pmaddwd %%mm7, %%mm3 \n\t" | ||
531 | "pand %%mm5, %%mm1 \n\t" | ||
532 | "pand %%mm5, %%mm4 \n\t" | ||
533 | "por %%mm1, %%mm0 \n\t" | ||
534 | "por %%mm4, %%mm3 \n\t" | ||
535 | "psrld $6, %%mm0 \n\t" | ||
536 | "pslld $10, %%mm3 \n\t" | ||
537 | "por %%mm3, %%mm0 \n\t" | ||
538 | MOVNTQ" %%mm0, (%0) \n\t" | ||
539 | "add $16, %1 \n\t" | ||
540 | "add $8, %0 \n\t" | ||
541 | "2: \n\t" | ||
542 | "cmp %2, %1 \n\t" | ||
543 | " jb 1b \n\t" | ||
544 | : "+r" (d), "+r"(s) | ||
545 | : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) | ||
546 | ); | ||
547 | #else | ||
548 | asm volatile(PREFETCH" %0"::"m"(*src):"memory"); | ||
549 | asm volatile( | ||
550 | "movq %0, %%mm7 \n\t" | ||
551 | "movq %1, %%mm6 \n\t" | ||
552 | ::"m"(red_15mask),"m"(green_15mask)); | ||
553 | while (s < mm_end) | ||
554 | { | ||
555 | asm volatile( | ||
556 | PREFETCH" 32%1 \n\t" | ||
557 | "movd %1, %%mm0 \n\t" | ||
558 | "movd 4%1, %%mm3 \n\t" | ||
559 | "punpckldq 8%1, %%mm0 \n\t" | ||
560 | "punpckldq 12%1, %%mm3 \n\t" | ||
561 | "movq %%mm0, %%mm1 \n\t" | ||
562 | "movq %%mm0, %%mm2 \n\t" | ||
563 | "movq %%mm3, %%mm4 \n\t" | ||
564 | "movq %%mm3, %%mm5 \n\t" | ||
565 | "psrlq $3, %%mm0 \n\t" | ||
566 | "psrlq $3, %%mm3 \n\t" | ||
567 | "pand %2, %%mm0 \n\t" | ||
568 | "pand %2, %%mm3 \n\t" | ||
569 | "psrlq $6, %%mm1 \n\t" | ||
570 | "psrlq $6, %%mm4 \n\t" | ||
571 | "pand %%mm6, %%mm1 \n\t" | ||
572 | "pand %%mm6, %%mm4 \n\t" | ||
573 | "psrlq $9, %%mm2 \n\t" | ||
574 | "psrlq $9, %%mm5 \n\t" | ||
575 | "pand %%mm7, %%mm2 \n\t" | ||
576 | "pand %%mm7, %%mm5 \n\t" | ||
577 | "por %%mm1, %%mm0 \n\t" | ||
578 | "por %%mm4, %%mm3 \n\t" | ||
579 | "por %%mm2, %%mm0 \n\t" | ||
580 | "por %%mm5, %%mm3 \n\t" | ||
581 | "psllq $16, %%mm3 \n\t" | ||
582 | "por %%mm3, %%mm0 \n\t" | ||
583 | MOVNTQ" %%mm0, %0 \n\t" | ||
584 | :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); | ||
585 | d += 4; | ||
586 | s += 16; | ||
587 | } | ||
588 | #endif | ||
589 | asm volatile(SFENCE:::"memory"); | ||
590 | asm volatile(EMMS:::"memory"); | ||
591 | #endif | ||
592 | while (s < end) | ||
593 | { | ||
594 | register int rgb = *(const uint32_t*)s; s += 4; | ||
595 | *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9); | ||
596 | } | ||
597 | } | ||
598 | |||
599 | static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size) | ||
600 | { | ||
601 | const uint8_t *s = src; | ||
602 | const uint8_t *end; | ||
603 | #ifdef HAVE_MMX | ||
604 | const uint8_t *mm_end; | ||
605 | #endif | ||
606 | uint16_t *d = (uint16_t *)dst; | ||
607 | end = s + src_size; | ||
608 | #ifdef HAVE_MMX | ||
609 | asm volatile(PREFETCH" %0"::"m"(*src):"memory"); | ||
610 | asm volatile( | ||
611 | "movq %0, %%mm7 \n\t" | ||
612 | "movq %1, %%mm6 \n\t" | ||
613 | ::"m"(red_15mask),"m"(green_15mask)); | ||
614 | mm_end = end - 15; | ||
615 | while (s < mm_end) | ||
616 | { | ||
617 | asm volatile( | ||
618 | PREFETCH" 32%1 \n\t" | ||
619 | "movd %1, %%mm0 \n\t" | ||
620 | "movd 4%1, %%mm3 \n\t" | ||
621 | "punpckldq 8%1, %%mm0 \n\t" | ||
622 | "punpckldq 12%1, %%mm3 \n\t" | ||
623 | "movq %%mm0, %%mm1 \n\t" | ||
624 | "movq %%mm0, %%mm2 \n\t" | ||
625 | "movq %%mm3, %%mm4 \n\t" | ||
626 | "movq %%mm3, %%mm5 \n\t" | ||
627 | "psllq $7, %%mm0 \n\t" | ||
628 | "psllq $7, %%mm3 \n\t" | ||
629 | "pand %%mm7, %%mm0 \n\t" | ||
630 | "pand %%mm7, %%mm3 \n\t" | ||
631 | "psrlq $6, %%mm1 \n\t" | ||
632 | "psrlq $6, %%mm4 \n\t" | ||
633 | "pand %%mm6, %%mm1 \n\t" | ||
634 | "pand %%mm6, %%mm4 \n\t" | ||
635 | "psrlq $19, %%mm2 \n\t" | ||
636 | "psrlq $19, %%mm5 \n\t" | ||
637 | "pand %2, %%mm2 \n\t" | ||
638 | "pand %2, %%mm5 \n\t" | ||
639 | "por %%mm1, %%mm0 \n\t" | ||
640 | "por %%mm4, %%mm3 \n\t" | ||
641 | "por %%mm2, %%mm0 \n\t" | ||
642 | "por %%mm5, %%mm3 \n\t" | ||
643 | "psllq $16, %%mm3 \n\t" | ||
644 | "por %%mm3, %%mm0 \n\t" | ||
645 | MOVNTQ" %%mm0, %0 \n\t" | ||
646 | :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); | ||
647 | d += 4; | ||
648 | s += 16; | ||
649 | } | ||
650 | asm volatile(SFENCE:::"memory"); | ||
651 | asm volatile(EMMS:::"memory"); | ||
652 | #endif | ||
653 | while (s < end) | ||
654 | { | ||
655 | register int rgb = *(const uint32_t*)s; s += 4; | ||
656 | *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19); | ||
657 | } | ||
658 | } | ||
659 | |||
660 | static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size) | ||
661 | { | ||
662 | const uint8_t *s = src; | ||
663 | const uint8_t *end; | ||
664 | #ifdef HAVE_MMX | ||
665 | const uint8_t *mm_end; | ||
666 | #endif | ||
667 | uint16_t *d = (uint16_t *)dst; | ||
668 | end = s + src_size; | ||
669 | #ifdef HAVE_MMX | ||
670 | asm volatile(PREFETCH" %0"::"m"(*src):"memory"); | ||
671 | asm volatile( | ||
672 | "movq %0, %%mm7 \n\t" | ||
673 | "movq %1, %%mm6 \n\t" | ||
674 | ::"m"(red_16mask),"m"(green_16mask)); | ||
675 | mm_end = end - 11; | ||
676 | while (s < mm_end) | ||
677 | { | ||
678 | asm volatile( | ||
679 | PREFETCH" 32%1 \n\t" | ||
680 | "movd %1, %%mm0 \n\t" | ||
681 | "movd 3%1, %%mm3 \n\t" | ||
682 | "punpckldq 6%1, %%mm0 \n\t" | ||
683 | "punpckldq 9%1, %%mm3 \n\t" | ||
684 | "movq %%mm0, %%mm1 \n\t" | ||
685 | "movq %%mm0, %%mm2 \n\t" | ||
686 | "movq %%mm3, %%mm4 \n\t" | ||
687 | "movq %%mm3, %%mm5 \n\t" | ||
688 | "psrlq $3, %%mm0 \n\t" | ||
689 | "psrlq $3, %%mm3 \n\t" | ||
690 | "pand %2, %%mm0 \n\t" | ||
691 | "pand %2, %%mm3 \n\t" | ||
692 | "psrlq $5, %%mm1 \n\t" | ||
693 | "psrlq $5, %%mm4 \n\t" | ||
694 | "pand %%mm6, %%mm1 \n\t" | ||
695 | "pand %%mm6, %%mm4 \n\t" | ||
696 | "psrlq $8, %%mm2 \n\t" | ||
697 | "psrlq $8, %%mm5 \n\t" | ||
698 | "pand %%mm7, %%mm2 \n\t" | ||
699 | "pand %%mm7, %%mm5 \n\t" | ||
700 | "por %%mm1, %%mm0 \n\t" | ||
701 | "por %%mm4, %%mm3 \n\t" | ||
702 | "por %%mm2, %%mm0 \n\t" | ||
703 | "por %%mm5, %%mm3 \n\t" | ||
704 | "psllq $16, %%mm3 \n\t" | ||
705 | "por %%mm3, %%mm0 \n\t" | ||
706 | MOVNTQ" %%mm0, %0 \n\t" | ||
707 | :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); | ||
708 | d += 4; | ||
709 | s += 12; | ||
710 | } | ||
711 | asm volatile(SFENCE:::"memory"); | ||
712 | asm volatile(EMMS:::"memory"); | ||
713 | #endif | ||
714 | while (s < end) | ||
715 | { | ||
716 | const int b = *s++; | ||
717 | const int g = *s++; | ||
718 | const int r = *s++; | ||
719 | *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); | ||
720 | } | ||
721 | } | ||
722 | |||
723 | static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size) | ||
724 | { | ||
725 | const uint8_t *s = src; | ||
726 | const uint8_t *end; | ||
727 | #ifdef HAVE_MMX | ||
728 | const uint8_t *mm_end; | ||
729 | #endif | ||
730 | uint16_t *d = (uint16_t *)dst; | ||
731 | end = s + src_size; | ||
732 | #ifdef HAVE_MMX | ||
733 | asm volatile(PREFETCH" %0"::"m"(*src):"memory"); | ||
734 | asm volatile( | ||
735 | "movq %0, %%mm7 \n\t" | ||
736 | "movq %1, %%mm6 \n\t" | ||
737 | ::"m"(red_16mask),"m"(green_16mask)); | ||
738 | mm_end = end - 15; | ||
739 | while (s < mm_end) | ||
740 | { | ||
741 | asm volatile( | ||
742 | PREFETCH" 32%1 \n\t" | ||
743 | "movd %1, %%mm0 \n\t" | ||
744 | "movd 3%1, %%mm3 \n\t" | ||
745 | "punpckldq 6%1, %%mm0 \n\t" | ||
746 | "punpckldq 9%1, %%mm3 \n\t" | ||
747 | "movq %%mm0, %%mm1 \n\t" | ||
748 | "movq %%mm0, %%mm2 \n\t" | ||
749 | "movq %%mm3, %%mm4 \n\t" | ||
750 | "movq %%mm3, %%mm5 \n\t" | ||
751 | "psllq $8, %%mm0 \n\t" | ||
752 | "psllq $8, %%mm3 \n\t" | ||
753 | "pand %%mm7, %%mm0 \n\t" | ||
754 | "pand %%mm7, %%mm3 \n\t" | ||
755 | "psrlq $5, %%mm1 \n\t" | ||
756 | "psrlq $5, %%mm4 \n\t" | ||
757 | "pand %%mm6, %%mm1 \n\t" | ||
758 | "pand %%mm6, %%mm4 \n\t" | ||
759 | "psrlq $19, %%mm2 \n\t" | ||
760 | "psrlq $19, %%mm5 \n\t" | ||
761 | "pand %2, %%mm2 \n\t" | ||
762 | "pand %2, %%mm5 \n\t" | ||
763 | "por %%mm1, %%mm0 \n\t" | ||
764 | "por %%mm4, %%mm3 \n\t" | ||
765 | "por %%mm2, %%mm0 \n\t" | ||
766 | "por %%mm5, %%mm3 \n\t" | ||
767 | "psllq $16, %%mm3 \n\t" | ||
768 | "por %%mm3, %%mm0 \n\t" | ||
769 | MOVNTQ" %%mm0, %0 \n\t" | ||
770 | :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); | ||
771 | d += 4; | ||
772 | s += 12; | ||
773 | } | ||
774 | asm volatile(SFENCE:::"memory"); | ||
775 | asm volatile(EMMS:::"memory"); | ||
776 | #endif | ||
777 | while (s < end) | ||
778 | { | ||
779 | const int r = *s++; | ||
780 | const int g = *s++; | ||
781 | const int b = *s++; | ||
782 | *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); | ||
783 | } | ||
784 | } | ||
785 | |||
786 | static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size) | ||
787 | { | ||
788 | const uint8_t *s = src; | ||
789 | const uint8_t *end; | ||
790 | #ifdef HAVE_MMX | ||
791 | const uint8_t *mm_end; | ||
792 | #endif | ||
793 | uint16_t *d = (uint16_t *)dst; | ||
794 | end = s + src_size; | ||
795 | #ifdef HAVE_MMX | ||
796 | asm volatile(PREFETCH" %0"::"m"(*src):"memory"); | ||
797 | asm volatile( | ||
798 | "movq %0, %%mm7 \n\t" | ||
799 | "movq %1, %%mm6 \n\t" | ||
800 | ::"m"(red_15mask),"m"(green_15mask)); | ||
801 | mm_end = end - 11; | ||
802 | while (s < mm_end) | ||
803 | { | ||
804 | asm volatile( | ||
805 | PREFETCH" 32%1 \n\t" | ||
806 | "movd %1, %%mm0 \n\t" | ||
807 | "movd 3%1, %%mm3 \n\t" | ||
808 | "punpckldq 6%1, %%mm0 \n\t" | ||
809 | "punpckldq 9%1, %%mm3 \n\t" | ||
810 | "movq %%mm0, %%mm1 \n\t" | ||
811 | "movq %%mm0, %%mm2 \n\t" | ||
812 | "movq %%mm3, %%mm4 \n\t" | ||
813 | "movq %%mm3, %%mm5 \n\t" | ||
814 | "psrlq $3, %%mm0 \n\t" | ||
815 | "psrlq $3, %%mm3 \n\t" | ||
816 | "pand %2, %%mm0 \n\t" | ||
817 | "pand %2, %%mm3 \n\t" | ||
818 | "psrlq $6, %%mm1 \n\t" | ||
819 | "psrlq $6, %%mm4 \n\t" | ||
820 | "pand %%mm6, %%mm1 \n\t" | ||
821 | "pand %%mm6, %%mm4 \n\t" | ||
822 | "psrlq $9, %%mm2 \n\t" | ||
823 | "psrlq $9, %%mm5 \n\t" | ||
824 | "pand %%mm7, %%mm2 \n\t" | ||
825 | "pand %%mm7, %%mm5 \n\t" | ||
826 | "por %%mm1, %%mm0 \n\t" | ||
827 | "por %%mm4, %%mm3 \n\t" | ||
828 | "por %%mm2, %%mm0 \n\t" | ||
829 | "por %%mm5, %%mm3 \n\t" | ||
830 | "psllq $16, %%mm3 \n\t" | ||
831 | "por %%mm3, %%mm0 \n\t" | ||
832 | MOVNTQ" %%mm0, %0 \n\t" | ||
833 | :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); | ||
834 | d += 4; | ||
835 | s += 12; | ||
836 | } | ||
837 | asm volatile(SFENCE:::"memory"); | ||
838 | asm volatile(EMMS:::"memory"); | ||
839 | #endif | ||
840 | while (s < end) | ||
841 | { | ||
842 | const int b = *s++; | ||
843 | const int g = *s++; | ||
844 | const int r = *s++; | ||
845 | *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); | ||
846 | } | ||
847 | } | ||
848 | |||
849 | static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size) | ||
850 | { | ||
851 | const uint8_t *s = src; | ||
852 | const uint8_t *end; | ||
853 | #ifdef HAVE_MMX | ||
854 | const uint8_t *mm_end; | ||
855 | #endif | ||
856 | uint16_t *d = (uint16_t *)dst; | ||
857 | end = s + src_size; | ||
858 | #ifdef HAVE_MMX | ||
859 | asm volatile(PREFETCH" %0"::"m"(*src):"memory"); | ||
860 | asm volatile( | ||
861 | "movq %0, %%mm7 \n\t" | ||
862 | "movq %1, %%mm6 \n\t" | ||
863 | ::"m"(red_15mask),"m"(green_15mask)); | ||
864 | mm_end = end - 15; | ||
865 | while (s < mm_end) | ||
866 | { | ||
867 | asm volatile( | ||
868 | PREFETCH" 32%1 \n\t" | ||
869 | "movd %1, %%mm0 \n\t" | ||
870 | "movd 3%1, %%mm3 \n\t" | ||
871 | "punpckldq 6%1, %%mm0 \n\t" | ||
872 | "punpckldq 9%1, %%mm3 \n\t" | ||
873 | "movq %%mm0, %%mm1 \n\t" | ||
874 | "movq %%mm0, %%mm2 \n\t" | ||
875 | "movq %%mm3, %%mm4 \n\t" | ||
876 | "movq %%mm3, %%mm5 \n\t" | ||
877 | "psllq $7, %%mm0 \n\t" | ||
878 | "psllq $7, %%mm3 \n\t" | ||
879 | "pand %%mm7, %%mm0 \n\t" | ||
880 | "pand %%mm7, %%mm3 \n\t" | ||
881 | "psrlq $6, %%mm1 \n\t" | ||
882 | "psrlq $6, %%mm4 \n\t" | ||
883 | "pand %%mm6, %%mm1 \n\t" | ||
884 | "pand %%mm6, %%mm4 \n\t" | ||
885 | "psrlq $19, %%mm2 \n\t" | ||
886 | "psrlq $19, %%mm5 \n\t" | ||
887 | "pand %2, %%mm2 \n\t" | ||
888 | "pand %2, %%mm5 \n\t" | ||
889 | "por %%mm1, %%mm0 \n\t" | ||
890 | "por %%mm4, %%mm3 \n\t" | ||
891 | "por %%mm2, %%mm0 \n\t" | ||
892 | "por %%mm5, %%mm3 \n\t" | ||
893 | "psllq $16, %%mm3 \n\t" | ||
894 | "por %%mm3, %%mm0 \n\t" | ||
895 | MOVNTQ" %%mm0, %0 \n\t" | ||
896 | :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); | ||
897 | d += 4; | ||
898 | s += 12; | ||
899 | } | ||
900 | asm volatile(SFENCE:::"memory"); | ||
901 | asm volatile(EMMS:::"memory"); | ||
902 | #endif | ||
903 | while (s < end) | ||
904 | { | ||
905 | const int r = *s++; | ||
906 | const int g = *s++; | ||
907 | const int b = *s++; | ||
908 | *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); | ||
909 | } | ||
910 | } | ||
911 | |||
912 | /* | ||
913 | I use less accurate approximation here by simply left-shifting the input | ||
914 | value and filling the low order bits with zeroes. This method improves PNG | ||
915 | compression but this scheme cannot reproduce white exactly, since it does | ||
916 | not generate an all-ones maximum value; the net effect is to darken the | ||
917 | image slightly. | ||
918 | |||
919 | The better method should be "left bit replication": | ||
920 | |||
921 | 4 3 2 1 0 | ||
922 | --------- | ||
923 | 1 1 0 1 1 | ||
924 | |||
925 | 7 6 5 4 3 2 1 0 | ||
926 | ---------------- | ||
927 | 1 1 0 1 1 1 1 0 | ||
928 | |=======| |===| | ||
929 | | leftmost bits repeated to fill open bits | ||
930 | | | ||
931 | original bits | ||
932 | */ | ||
/**
 * Convert packed 15bpp pixels (three 5-bit fields per 16-bit word) to
 * packed 24bpp. Each 5-bit field is widened to 8 bits by a plain left
 * shift, so the low bits are zero-filled (see the accuracy note in the
 * comment above). The low 5 bits of each input word become the first
 * output byte.
 *
 * @param src      input buffer, read as 16-bit pixels
 * @param dst      output buffer, 3 bytes per pixel
 * @param src_size size of src in bytes
 */
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    /* MMX path: 8 pixels per iteration; stops 7 words short of the end so
     * the scalar loop below finishes the remainder. */
    asm volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end)
    {
        /* Isolate the three bit-fields with the mask15* constants, shift
         * each into byte position, then interleave with zeros (mmx_null)
         * so every pixel occupies one 32-bit lane; first 4 pixels end up
         * in mm6/mm7, next 4 in mm0/mm3. */
        asm volatile(
        PREFETCH" 32%1 \n\t"
        "movq %1, %%mm0 \n\t"
        "movq %1, %%mm1 \n\t"
        "movq %1, %%mm2 \n\t"
        "pand %2, %%mm0 \n\t"
        "pand %3, %%mm1 \n\t"
        "pand %4, %%mm2 \n\t"
        "psllq $3, %%mm0 \n\t"
        "psrlq $2, %%mm1 \n\t"
        "psrlq $7, %%mm2 \n\t"
        "movq %%mm0, %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "movq %%mm2, %%mm5 \n\t"
        "punpcklwd %5, %%mm0 \n\t"
        "punpcklwd %5, %%mm1 \n\t"
        "punpcklwd %5, %%mm2 \n\t"
        "punpckhwd %5, %%mm3 \n\t"
        "punpckhwd %5, %%mm4 \n\t"
        "punpckhwd %5, %%mm5 \n\t"
        "psllq $8, %%mm1 \n\t"
        "psllq $16, %%mm2 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "psllq $8, %%mm4 \n\t"
        "psllq $16, %%mm5 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm5, %%mm3 \n\t"

        "movq %%mm0, %%mm6 \n\t"
        "movq %%mm3, %%mm7 \n\t"

        "movq 8%1, %%mm0 \n\t"
        "movq 8%1, %%mm1 \n\t"
        "movq 8%1, %%mm2 \n\t"
        "pand %2, %%mm0 \n\t"
        "pand %3, %%mm1 \n\t"
        "pand %4, %%mm2 \n\t"
        "psllq $3, %%mm0 \n\t"
        "psrlq $2, %%mm1 \n\t"
        "psrlq $7, %%mm2 \n\t"
        "movq %%mm0, %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "movq %%mm2, %%mm5 \n\t"
        "punpcklwd %5, %%mm0 \n\t"
        "punpcklwd %5, %%mm1 \n\t"
        "punpcklwd %5, %%mm2 \n\t"
        "punpckhwd %5, %%mm3 \n\t"
        "punpckhwd %5, %%mm4 \n\t"
        "punpckhwd %5, %%mm5 \n\t"
        "psllq $8, %%mm1 \n\t"
        "psllq $16, %%mm2 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "psllq $8, %%mm4 \n\t"
        "psllq $16, %%mm5 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm5, %%mm3 \n\t"

        :"=m"(*d)
        :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
        :"memory");
        /* borrowed 32 to 24 */
        /* Second stage: repack the eight 32-bit pixels held in the MMX
         * registers into 24 contiguous output bytes, stored with MOVNTQ.
         * NOTE(review): relies on register state surviving between the
         * two asm statements. */
        asm volatile(
        "movq %%mm0, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "movq %%mm6, %%mm0 \n\t"
        "movq %%mm7, %%mm1 \n\t"

        "movq %%mm4, %%mm6 \n\t"
        "movq %%mm5, %%mm7 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"

        "psrlq $8, %%mm2 \n\t"
        "psrlq $8, %%mm3 \n\t"
        "psrlq $8, %%mm6 \n\t"
        "psrlq $8, %%mm7 \n\t"
        "pand %2, %%mm0 \n\t"
        "pand %2, %%mm1 \n\t"
        "pand %2, %%mm4 \n\t"
        "pand %2, %%mm5 \n\t"
        "pand %3, %%mm2 \n\t"
        "pand %3, %%mm3 \n\t"
        "pand %3, %%mm6 \n\t"
        "pand %3, %%mm7 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm3, %%mm1 \n\t"
        "por %%mm6, %%mm4 \n\t"
        "por %%mm7, %%mm5 \n\t"

        "movq %%mm1, %%mm2 \n\t"
        "movq %%mm4, %%mm3 \n\t"
        "psllq $48, %%mm2 \n\t"
        "psllq $32, %%mm3 \n\t"
        "pand %4, %%mm2 \n\t"
        "pand %5, %%mm3 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "psrlq $16, %%mm1 \n\t"
        "psrlq $32, %%mm4 \n\t"
        "psllq $16, %%mm5 \n\t"
        "por %%mm3, %%mm1 \n\t"
        "pand %6, %%mm5 \n\t"
        "por %%mm5, %%mm4 \n\t"

        MOVNTQ" %%mm0, %0 \n\t"
        MOVNTQ" %%mm1, 8%0 \n\t"
        MOVNTQ" %%mm4, 16%0"

        :"=m"(*d)
        :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
        d += 24;
        s += 8;
    }
    asm volatile(SFENCE:::"memory");
    asm volatile(EMMS:::"memory");
#endif
    /* Scalar fallback / tail: expand each 5-bit field with a shift. */
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}
1074 | |||
/**
 * Convert packed 16bpp pixels (5:6:5 fields per 16-bit word) to packed
 * 24bpp. Fields are widened to 8 bits by shifting, so low bits are
 * zero-filled. The low 5 bits of each input word become the first output
 * byte.
 *
 * @param src      input buffer, read as 16-bit pixels
 * @param dst      output buffer, 3 bytes per pixel
 * @param src_size size of src in bytes
 */
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    /* MMX path: 8 pixels per iteration; identical structure to rgb15to24
     * but with the 5:6:5 masks and shift amounts. */
    asm volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end)
    {
        /* Split fields via mask16*, widen, and spread to one 32-bit lane
         * per pixel (first 4 pixels parked in mm6/mm7, next 4 in mm0/mm3). */
        asm volatile(
        PREFETCH" 32%1 \n\t"
        "movq %1, %%mm0 \n\t"
        "movq %1, %%mm1 \n\t"
        "movq %1, %%mm2 \n\t"
        "pand %2, %%mm0 \n\t"
        "pand %3, %%mm1 \n\t"
        "pand %4, %%mm2 \n\t"
        "psllq $3, %%mm0 \n\t"
        "psrlq $3, %%mm1 \n\t"
        "psrlq $8, %%mm2 \n\t"
        "movq %%mm0, %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "movq %%mm2, %%mm5 \n\t"
        "punpcklwd %5, %%mm0 \n\t"
        "punpcklwd %5, %%mm1 \n\t"
        "punpcklwd %5, %%mm2 \n\t"
        "punpckhwd %5, %%mm3 \n\t"
        "punpckhwd %5, %%mm4 \n\t"
        "punpckhwd %5, %%mm5 \n\t"
        "psllq $8, %%mm1 \n\t"
        "psllq $16, %%mm2 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "psllq $8, %%mm4 \n\t"
        "psllq $16, %%mm5 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm5, %%mm3 \n\t"

        "movq %%mm0, %%mm6 \n\t"
        "movq %%mm3, %%mm7 \n\t"

        "movq 8%1, %%mm0 \n\t"
        "movq 8%1, %%mm1 \n\t"
        "movq 8%1, %%mm2 \n\t"
        "pand %2, %%mm0 \n\t"
        "pand %3, %%mm1 \n\t"
        "pand %4, %%mm2 \n\t"
        "psllq $3, %%mm0 \n\t"
        "psrlq $3, %%mm1 \n\t"
        "psrlq $8, %%mm2 \n\t"
        "movq %%mm0, %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "movq %%mm2, %%mm5 \n\t"
        "punpcklwd %5, %%mm0 \n\t"
        "punpcklwd %5, %%mm1 \n\t"
        "punpcklwd %5, %%mm2 \n\t"
        "punpckhwd %5, %%mm3 \n\t"
        "punpckhwd %5, %%mm4 \n\t"
        "punpckhwd %5, %%mm5 \n\t"
        "psllq $8, %%mm1 \n\t"
        "psllq $16, %%mm2 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "psllq $8, %%mm4 \n\t"
        "psllq $16, %%mm5 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm5, %%mm3 \n\t"
        :"=m"(*d)
        :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
        :"memory");
        /* borrowed 32 to 24 */
        /* Second stage: repack the 8 widened pixels into 24 output bytes.
         * NOTE(review): relies on MMX register state surviving between
         * the two asm statements. */
        asm volatile(
        "movq %%mm0, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "movq %%mm6, %%mm0 \n\t"
        "movq %%mm7, %%mm1 \n\t"

        "movq %%mm4, %%mm6 \n\t"
        "movq %%mm5, %%mm7 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"

        "psrlq $8, %%mm2 \n\t"
        "psrlq $8, %%mm3 \n\t"
        "psrlq $8, %%mm6 \n\t"
        "psrlq $8, %%mm7 \n\t"
        "pand %2, %%mm0 \n\t"
        "pand %2, %%mm1 \n\t"
        "pand %2, %%mm4 \n\t"
        "pand %2, %%mm5 \n\t"
        "pand %3, %%mm2 \n\t"
        "pand %3, %%mm3 \n\t"
        "pand %3, %%mm6 \n\t"
        "pand %3, %%mm7 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm3, %%mm1 \n\t"
        "por %%mm6, %%mm4 \n\t"
        "por %%mm7, %%mm5 \n\t"

        "movq %%mm1, %%mm2 \n\t"
        "movq %%mm4, %%mm3 \n\t"
        "psllq $48, %%mm2 \n\t"
        "psllq $32, %%mm3 \n\t"
        "pand %4, %%mm2 \n\t"
        "pand %5, %%mm3 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "psrlq $16, %%mm1 \n\t"
        "psrlq $32, %%mm4 \n\t"
        "psllq $16, %%mm5 \n\t"
        "por %%mm3, %%mm1 \n\t"
        "pand %6, %%mm5 \n\t"
        "por %%mm5, %%mm4 \n\t"

        MOVNTQ" %%mm0, %0 \n\t"
        MOVNTQ" %%mm1, 8%0 \n\t"
        MOVNTQ" %%mm4, 16%0"

        :"=m"(*d)
        :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
        d += 24;
        s += 8;
    }
    asm volatile(SFENCE:::"memory");
    asm volatile(EMMS:::"memory");
#endif
    /* Scalar fallback / tail. */
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}
1215 | |||
/**
 * Convert packed 15bpp pixels (5:5:5 fields per 16-bit word) to packed
 * 32bpp. Each 5-bit field is widened by a left shift (low bits zero);
 * the fourth byte of every output pixel is written as 0.
 *
 * @param src      input buffer, read as 16-bit pixels
 * @param dst      output buffer, 4 bytes per pixel
 * @param src_size size of src in bytes
 */
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    /* MMX path: 4 pixels per iteration; mm7 is kept zero and used for
     * the zero-interleaving unpacks (also supplies the zero alpha byte). */
    asm volatile(PREFETCH" %0"::"m"(*s):"memory");
    asm volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end)
    {
        asm volatile(
        PREFETCH" 32%1 \n\t"
        "movq %1, %%mm0 \n\t"
        "movq %1, %%mm1 \n\t"
        "movq %1, %%mm2 \n\t"
        "pand %2, %%mm0 \n\t"
        "pand %3, %%mm1 \n\t"
        "pand %4, %%mm2 \n\t"
        "psllq $3, %%mm0 \n\t"
        "psrlq $2, %%mm1 \n\t"
        "psrlq $7, %%mm2 \n\t"
        "movq %%mm0, %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "movq %%mm2, %%mm5 \n\t"
        "punpcklwd %%mm7, %%mm0 \n\t"
        "punpcklwd %%mm7, %%mm1 \n\t"
        "punpcklwd %%mm7, %%mm2 \n\t"
        "punpckhwd %%mm7, %%mm3 \n\t"
        "punpckhwd %%mm7, %%mm4 \n\t"
        "punpckhwd %%mm7, %%mm5 \n\t"
        "psllq $8, %%mm1 \n\t"
        "psllq $16, %%mm2 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "psllq $8, %%mm4 \n\t"
        "psllq $16, %%mm5 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm5, %%mm3 \n\t"
        MOVNTQ" %%mm0, %0 \n\t"
        MOVNTQ" %%mm3, 8%0 \n\t"
        :"=m"(*d)
        :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
        :"memory");
        d += 16;
        s += 4;
    }
    asm volatile(SFENCE:::"memory");
    asm volatile(EMMS:::"memory");
#endif
    /* Scalar fallback / tail; byte order is flipped on big-endian hosts. */
    while (s < end)
    {
#if 0 //slightly slower on Athlon
        int bgr= *s++;
        *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
#else
        register uint16_t bgr;
        bgr = *s++;
#ifdef WORDS_BIGENDIAN
        *d++ = 0;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 0;
#endif

#endif
    }
}
1293 | |||
/**
 * Convert packed 16bpp pixels (5:6:5 fields per 16-bit word) to packed
 * 32bpp. Fields are widened by shifting (low bits zero); the fourth byte
 * of every output pixel is written as 0.
 *
 * @param src      input buffer, read as 16-bit pixels
 * @param dst      output buffer, 4 bytes per pixel
 * @param src_size size of src in bytes
 */
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    /* MMX path: 4 pixels per iteration; mm7 held at zero for unpacking
     * and for the zero alpha byte. */
    asm volatile(PREFETCH" %0"::"m"(*s):"memory");
    asm volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end)
    {
        asm volatile(
        PREFETCH" 32%1 \n\t"
        "movq %1, %%mm0 \n\t"
        "movq %1, %%mm1 \n\t"
        "movq %1, %%mm2 \n\t"
        "pand %2, %%mm0 \n\t"
        "pand %3, %%mm1 \n\t"
        "pand %4, %%mm2 \n\t"
        "psllq $3, %%mm0 \n\t"
        "psrlq $3, %%mm1 \n\t"
        "psrlq $8, %%mm2 \n\t"
        "movq %%mm0, %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "movq %%mm2, %%mm5 \n\t"
        "punpcklwd %%mm7, %%mm0 \n\t"
        "punpcklwd %%mm7, %%mm1 \n\t"
        "punpcklwd %%mm7, %%mm2 \n\t"
        "punpckhwd %%mm7, %%mm3 \n\t"
        "punpckhwd %%mm7, %%mm4 \n\t"
        "punpckhwd %%mm7, %%mm5 \n\t"
        "psllq $8, %%mm1 \n\t"
        "psllq $16, %%mm2 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "psllq $8, %%mm4 \n\t"
        "psllq $16, %%mm5 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm5, %%mm3 \n\t"
        MOVNTQ" %%mm0, %0 \n\t"
        MOVNTQ" %%mm3, 8%0 \n\t"
        :"=m"(*d)
        :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
        :"memory");
        d += 16;
        s += 4;
    }
    asm volatile(SFENCE:::"memory");
    asm volatile(EMMS:::"memory");
#endif
    /* Scalar fallback / tail; byte order is flipped on big-endian hosts. */
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
#ifdef WORDS_BIGENDIAN
        *d++ = 0;
        *d++ = (bgr&0xF800)>>8;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 0;
#endif
    }
}
1365 | |||
/**
 * Swap the first and third byte of every 32-bit pixel (R<->B), leaving
 * the second and fourth bytes in place.
 *
 * Uses a negative index that counts up to 0 so a single register drives
 * both the load and the store; the scalar tail then handles the final
 * (src_size mod 16) bytes starting from idx.
 *
 * @param src_size size of src in bytes (4 bytes per pixel)
 */
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    long idx = 15 - src_size;
    const uint8_t *s = src-idx;
    uint8_t *d = dst-idx;
#ifdef HAVE_MMX
    /* 16 bytes (4 pixels) per iteration; skipped entirely when
     * src_size <= 15 (idx is then non-negative). The MMX2 variant uses
     * pshufw to swap lanes, the plain MMX variant shifts and masks. */
    asm volatile(
    "test %0, %0 \n\t"
    "jns 2f \n\t"
    PREFETCH" (%1, %0) \n\t"
    "movq %3, %%mm7 \n\t"
    "pxor %4, %%mm7 \n\t"
    "movq %%mm7, %%mm6 \n\t"
    "pxor %5, %%mm7 \n\t"
    ASMALIGN(4)
    "1: \n\t"
    PREFETCH" 32(%1, %0) \n\t"
    "movq (%1, %0), %%mm0 \n\t"
    "movq 8(%1, %0), %%mm1 \n\t"
# ifdef HAVE_MMX2
    "pshufw $177, %%mm0, %%mm3 \n\t"
    "pshufw $177, %%mm1, %%mm5 \n\t"
    "pand %%mm7, %%mm0 \n\t"
    "pand %%mm6, %%mm3 \n\t"
    "pand %%mm7, %%mm1 \n\t"
    "pand %%mm6, %%mm5 \n\t"
    "por %%mm3, %%mm0 \n\t"
    "por %%mm5, %%mm1 \n\t"
# else
    "movq %%mm0, %%mm2 \n\t"
    "movq %%mm1, %%mm4 \n\t"
    "pand %%mm7, %%mm0 \n\t"
    "pand %%mm6, %%mm2 \n\t"
    "pand %%mm7, %%mm1 \n\t"
    "pand %%mm6, %%mm4 \n\t"
    "movq %%mm2, %%mm3 \n\t"
    "movq %%mm4, %%mm5 \n\t"
    "pslld $16, %%mm2 \n\t"
    "psrld $16, %%mm3 \n\t"
    "pslld $16, %%mm4 \n\t"
    "psrld $16, %%mm5 \n\t"
    "por %%mm2, %%mm0 \n\t"
    "por %%mm4, %%mm1 \n\t"
    "por %%mm3, %%mm0 \n\t"
    "por %%mm5, %%mm1 \n\t"
# endif
    MOVNTQ" %%mm0, (%2, %0) \n\t"
    MOVNTQ" %%mm1, 8(%2, %0) \n\t"
    "add $16, %0 \n\t"
    "js 1b \n\t"
    SFENCE" \n\t"
    EMMS" \n\t"
    "2: \n\t"
    : "+&r"(idx)
    : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
    : "memory");
#endif
    /* Scalar tail: swap the two masked byte pairs within each dword.
     * NOTE(review): the uint32_t casts type-pun uint8_t buffers — a
     * strict-aliasing concern on modern compilers. */
    for (; idx<15; idx+=4) {
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}
1429 | |||
/**
 * Swap the first and third byte of every 3-byte pixel (R<->B) in a packed
 * 24bpp buffer.
 *
 * @param src_size size of src in bytes (must be a multiple of 3)
 */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    unsigned i;
#ifdef HAVE_MMX
    /* MMX path: 24 bytes (8 pixels) per iteration, driven by a negative
     * counter in REG_a; the mask24r/g/b constants select the bytes that
     * stay vs. move within each 8-byte group. */
    long mmx_size= 23 - src_size;
    asm volatile (
    "test %%"REG_a", %%"REG_a" \n\t"
    "jns 2f \n\t"
    "movq "MANGLE(mask24r)", %%mm5 \n\t"
    "movq "MANGLE(mask24g)", %%mm6 \n\t"
    "movq "MANGLE(mask24b)", %%mm7 \n\t"
    ASMALIGN(4)
    "1: \n\t"
    PREFETCH" 32(%1, %%"REG_a") \n\t"
    "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
    "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
    "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
    "psllq $16, %%mm0 \n\t" // 00 BGR BGR
    "pand %%mm5, %%mm0 \n\t"
    "pand %%mm6, %%mm1 \n\t"
    "pand %%mm7, %%mm2 \n\t"
    "por %%mm0, %%mm1 \n\t"
    "por %%mm2, %%mm1 \n\t"
    "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
    MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
    "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
    "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
    "pand %%mm7, %%mm0 \n\t"
    "pand %%mm5, %%mm1 \n\t"
    "pand %%mm6, %%mm2 \n\t"
    "por %%mm0, %%mm1 \n\t"
    "por %%mm2, %%mm1 \n\t"
    "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
    MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
    "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
    "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
    "pand %%mm6, %%mm0 \n\t"
    "pand %%mm7, %%mm1 \n\t"
    "pand %%mm5, %%mm2 \n\t"
    "por %%mm0, %%mm1 \n\t"
    "por %%mm2, %%mm1 \n\t"
    MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
    "add $24, %%"REG_a" \n\t"
    " js 1b \n\t"
    "2: \n\t"
    : "+a" (mmx_size)
    : "r" (src-mmx_size), "r"(dst-mmx_size)
    );

    asm volatile(SFENCE:::"memory");
    asm volatile(EMMS:::"memory");

    if (mmx_size==23) return; //finished, was multiple of 8

    /* Advance src/dst past the converted region and let the scalar loop
     * below process the remaining (23 - mmx_size) bytes. */
    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
#endif
    /* Scalar fallback / tail: swap bytes 0 and 2 of each 3-byte pixel. */
    for (i=0; i<src_size; i+=3)
    {
        register uint8_t x;
        x = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
1499 | |||
/**
 * Interleave planar Y, U, V into packed YUY2 (Y0 U0 Y1 V0 per pair of
 * pixels in little-endian byte order; reversed on big-endian hosts).
 *
 * @param vertLumPerChroma number of luma lines sharing one chroma line
 *        (2 for 4:2:0 input, 1 for 4:2:2); must be a power of two since
 *        the advance test uses a bit mask.
 */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const long chromWidth= width>>1;
    for (y=0; y<height; y++)
    {
#ifdef HAVE_MMX
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        /* Interleave 16 luma + 8 chroma pairs (32 output bytes) per
         * iteration via punpck; assumes chromWidth is a multiple of 8. */
        asm volatile(
        "xor %%"REG_a", %%"REG_a" \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
        PREFETCH" 32(%2, %%"REG_a") \n\t"
        PREFETCH" 32(%3, %%"REG_a") \n\t"
        "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
        "movq %%mm0, %%mm2 \n\t" // U(0)
        "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
        "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
        "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

        "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
        "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
        "movq %%mm3, %%mm4 \n\t" // Y(0)
        "movq %%mm5, %%mm6 \n\t" // Y(8)
        "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
        "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
        "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
        "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

        MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
        MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
        MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
        MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

        "add $8, %%"REG_a" \n\t"
        "cmp %4, %%"REG_a" \n\t"
        " jb 1b \n\t"
        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
        : "%"REG_a
        );
#else

#if defined ARCH_ALPHA && defined HAVE_MVI
        /* Alpha MVI path: builds two output lines at once with unpkbw/
         * unpkbl; note it advances y/ysrc/dst an extra line at the end. */
#define pl2yuy2(n) \
    y1 = yc[n]; \
    y2 = yc2[n]; \
    u = uc[n]; \
    v = vc[n]; \
    asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
    asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
    asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
    asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
    yuv1 = (u << 8) + (v << 24); \
    yuv2 = yuv1 + y2; \
    yuv1 += y1; \
    qdst[n] = yuv1; \
    qdst2[n] = yuv2;

        int i;
        uint64_t *qdst = (uint64_t *) dst;
        uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
        const uint32_t *yc = (uint32_t *) ysrc;
        const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
        const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
        for (i = 0; i < chromWidth; i += 8){
            uint64_t y1, y2, yuv1, yuv2;
            uint64_t u, v;
            /* Prefetch */
            asm("ldq $31,64(%0)" :: "r"(yc));
            asm("ldq $31,64(%0)" :: "r"(yc2));
            asm("ldq $31,64(%0)" :: "r"(uc));
            asm("ldq $31,64(%0)" :: "r"(vc));

            pl2yuy2(0);
            pl2yuy2(1);
            pl2yuy2(2);
            pl2yuy2(3);

            yc += 4;
            yc2 += 4;
            uc += 4;
            vc += 4;
            qdst += 4;
            qdst2 += 4;
        }
        y++;
        ysrc += lumStride;
        dst += dstStride;

#elif __WORDSIZE >= 64
        /* 64-bit scalar path: pack two YUYV dwords per 64-bit store. */
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            k = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            l = yc[2] + (uc[1] << 8) +
                (yc[3] << 16) + (vc[1] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        /* Generic 32-bit scalar path: one YUYV dword per chroma sample. */
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
            *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
                (yc[1] << 8) + (vc[0] << 0);
#else
            *idst++ = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        /* Advance chroma only every vertLumPerChroma luma lines. */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#ifdef HAVE_MMX
    asm( EMMS" \n\t"
         SFENCE" \n\t"
         :::"memory");
#endif
}
1639 | |||
1640 | /** | ||
1641 | * Height should be a multiple of 2 and width should be a multiple of 16. | ||
1642 | * (If this is a problem for anyone then tell me, and I will fix it.) | ||
1643 | */ | ||
1644 | static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | ||
1645 | long width, long height, | ||
1646 | long lumStride, long chromStride, long dstStride) | ||
1647 | { | ||
1648 | //FIXME interpolate chroma | ||
1649 | RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); | ||
1650 | } | ||
1651 | |||
/**
 * Interleave planar Y, U, V into packed UYVY (U0 Y0 V0 Y1 per pair of
 * pixels in little-endian byte order; reversed on big-endian hosts).
 *
 * @param vertLumPerChroma number of luma lines sharing one chroma line
 *        (2 for 4:2:0, 1 for 4:2:2); must be a power of two since the
 *        advance test uses a bit mask.
 */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const long chromWidth= width>>1;
    for (y=0; y<height; y++)
    {
#ifdef HAVE_MMX
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        /* Same structure as the yuy2 variant but the punpck operand order
         * is swapped so chroma bytes land first in each pair. */
        asm volatile(
        "xor %%"REG_a", %%"REG_a" \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
        PREFETCH" 32(%2, %%"REG_a") \n\t"
        PREFETCH" 32(%3, %%"REG_a") \n\t"
        "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
        "movq %%mm0, %%mm2 \n\t" // U(0)
        "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
        "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
        "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

        "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
        "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
        "movq %%mm0, %%mm4 \n\t" // Y(0)
        "movq %%mm2, %%mm6 \n\t" // Y(8)
        "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
        "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
        "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
        "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)

        MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
        MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
        MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
        MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

        "add $8, %%"REG_a" \n\t"
        "cmp %4, %%"REG_a" \n\t"
        " jb 1b \n\t"
        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
        : "%"REG_a
        );
#else
//FIXME adapt the Alpha ASM code from yv12->yuy2

#if __WORDSIZE >= 64
        /* 64-bit scalar path: pack two UYVY dwords per 64-bit store. */
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            k = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
            l = uc[1] + (yc[2] << 8) +
                (vc[1] << 16) + (yc[3] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        /* Generic 32-bit scalar path: one UYVY dword per chroma sample. */
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
            *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
                (vc[0] << 8) + (yc[1] << 0);
#else
            *idst++ = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        /* Advance chroma only every vertLumPerChroma luma lines. */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#ifdef HAVE_MMX
    asm( EMMS" \n\t"
         SFENCE" \n\t"
         :::"memory");
#endif
}
1745 | |||
1746 | /** | ||
1747 | * Height should be a multiple of 2 and width should be a multiple of 16 | ||
1748 | * (If this is a problem for anyone then tell me, and I will fix it.) | ||
1749 | */ | ||
1750 | static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | ||
1751 | long width, long height, | ||
1752 | long lumStride, long chromStride, long dstStride) | ||
1753 | { | ||
1754 | //FIXME interpolate chroma | ||
1755 | RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); | ||
1756 | } | ||
1757 | |||
1758 | /** | ||
1759 | * Width should be a multiple of 16. | ||
1760 | */ | ||
1761 | static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, | ||
1762 | long width, long height, | ||
1763 | long lumStride, long chromStride, long dstStride) | ||
1764 | { | ||
1765 | RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1); | ||
1766 | } | ||
1767 | |||
1768 | /** | ||
1769 | * Height should be a multiple of 2 and width should be a multiple of 16. | ||
1770 | * (If this is a problem for anyone then tell me, and I will fix it.) | ||
1771 | */ | ||
1772 | static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | ||
1773 | long width, long height, | ||
1774 | long lumStride, long chromStride, long srcStride) | ||
1775 | { | ||
1776 | long y; | ||
1777 | const long chromWidth= width>>1; | ||
1778 | for (y=0; y<height; y+=2) | ||
1779 | { | ||
1780 | #ifdef HAVE_MMX | ||
1781 | asm volatile( | ||
1782 | "xor %%"REG_a", %%"REG_a" \n\t" | ||
1783 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
1784 | "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... | ||
1785 | ASMALIGN(4) | ||
1786 | "1: \n\t" | ||
1787 | PREFETCH" 64(%0, %%"REG_a", 4) \n\t" | ||
1788 | "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) | ||
1789 | "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) | ||
1790 | "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0) | ||
1791 | "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4) | ||
1792 | "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0) | ||
1793 | "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4) | ||
1794 | "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0) | ||
1795 | "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4) | ||
1796 | "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | ||
1797 | "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) | ||
1798 | |||
1799 | MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t" | ||
1800 | |||
1801 | "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8) | ||
1802 | "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12) | ||
1803 | "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8) | ||
1804 | "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12) | ||
1805 | "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8) | ||
1806 | "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12) | ||
1807 | "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8) | ||
1808 | "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12) | ||
1809 | "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) | ||
1810 | "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) | ||
1811 | |||
1812 | MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t" | ||
1813 | |||
1814 | "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) | ||
1815 | "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) | ||
1816 | "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) | ||
1817 | "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) | ||
1818 | "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) | ||
1819 | "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) | ||
1820 | "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) | ||
1821 | "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) | ||
1822 | |||
1823 | MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t" | ||
1824 | MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t" | ||
1825 | |||
1826 | "add $8, %%"REG_a" \n\t" | ||
1827 | "cmp %4, %%"REG_a" \n\t" | ||
1828 | " jb 1b \n\t" | ||
1829 | ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | ||
1830 | : "memory", "%"REG_a | ||
1831 | ); | ||
1832 | |||
1833 | ydst += lumStride; | ||
1834 | src += srcStride; | ||
1835 | |||
1836 | asm volatile( | ||
1837 | "xor %%"REG_a", %%"REG_a" \n\t" | ||
1838 | ASMALIGN(4) | ||
1839 | "1: \n\t" | ||
1840 | PREFETCH" 64(%0, %%"REG_a", 4) \n\t" | ||
1841 | "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) | ||
1842 | "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) | ||
1843 | "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8) | ||
1844 | "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12) | ||
1845 | "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0) | ||
1846 | "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4) | ||
1847 | "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8) | ||
1848 | "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12) | ||
1849 | "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) | ||
1850 | "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) | ||
1851 | |||
1852 | MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t" | ||
1853 | MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t" | ||
1854 | |||
1855 | "add $8, %%"REG_a" \n\t" | ||
1856 | "cmp %4, %%"REG_a" \n\t" | ||
1857 | " jb 1b \n\t" | ||
1858 | |||
1859 | ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | ||
1860 | : "memory", "%"REG_a | ||
1861 | ); | ||
1862 | #else | ||
1863 | long i; | ||
1864 | for (i=0; i<chromWidth; i++) | ||
1865 | { | ||
1866 | ydst[2*i+0] = src[4*i+0]; | ||
1867 | udst[i] = src[4*i+1]; | ||
1868 | ydst[2*i+1] = src[4*i+2]; | ||
1869 | vdst[i] = src[4*i+3]; | ||
1870 | } | ||
1871 | ydst += lumStride; | ||
1872 | src += srcStride; | ||
1873 | |||
1874 | for (i=0; i<chromWidth; i++) | ||
1875 | { | ||
1876 | ydst[2*i+0] = src[4*i+0]; | ||
1877 | ydst[2*i+1] = src[4*i+2]; | ||
1878 | } | ||
1879 | #endif | ||
1880 | udst += chromStride; | ||
1881 | vdst += chromStride; | ||
1882 | ydst += lumStride; | ||
1883 | src += srcStride; | ||
1884 | } | ||
1885 | #ifdef HAVE_MMX | ||
1886 | asm volatile( EMMS" \n\t" | ||
1887 | SFENCE" \n\t" | ||
1888 | :::"memory"); | ||
1889 | #endif | ||
1890 | } | ||
1891 | |||
1892 | static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, | ||
1893 | uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | ||
1894 | long width, long height, long lumStride, long chromStride) | ||
1895 | { | ||
1896 | /* Y Plane */ | ||
1897 | memcpy(ydst, ysrc, width*height); | ||
1898 | |||
1899 | /* XXX: implement upscaling for U,V */ | ||
1900 | } | ||
1901 | |||
1902 | static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride) | ||
1903 | { | ||
1904 | long x,y; | ||
1905 | |||
1906 | dst[0]= src[0]; | ||
1907 | |||
1908 | // first line | ||
1909 | for (x=0; x<srcWidth-1; x++){ | ||
1910 | dst[2*x+1]= (3*src[x] + src[x+1])>>2; | ||
1911 | dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; | ||
1912 | } | ||
1913 | dst[2*srcWidth-1]= src[srcWidth-1]; | ||
1914 | |||
1915 | dst+= dstStride; | ||
1916 | |||
1917 | for (y=1; y<srcHeight; y++){ | ||
1918 | #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | ||
1919 | const long mmxSize= srcWidth&~15; | ||
1920 | asm volatile( | ||
1921 | "mov %4, %%"REG_a" \n\t" | ||
1922 | "1: \n\t" | ||
1923 | "movq (%0, %%"REG_a"), %%mm0 \n\t" | ||
1924 | "movq (%1, %%"REG_a"), %%mm1 \n\t" | ||
1925 | "movq 1(%0, %%"REG_a"), %%mm2 \n\t" | ||
1926 | "movq 1(%1, %%"REG_a"), %%mm3 \n\t" | ||
1927 | "movq -1(%0, %%"REG_a"), %%mm4 \n\t" | ||
1928 | "movq -1(%1, %%"REG_a"), %%mm5 \n\t" | ||
1929 | PAVGB" %%mm0, %%mm5 \n\t" | ||
1930 | PAVGB" %%mm0, %%mm3 \n\t" | ||
1931 | PAVGB" %%mm0, %%mm5 \n\t" | ||
1932 | PAVGB" %%mm0, %%mm3 \n\t" | ||
1933 | PAVGB" %%mm1, %%mm4 \n\t" | ||
1934 | PAVGB" %%mm1, %%mm2 \n\t" | ||
1935 | PAVGB" %%mm1, %%mm4 \n\t" | ||
1936 | PAVGB" %%mm1, %%mm2 \n\t" | ||
1937 | "movq %%mm5, %%mm7 \n\t" | ||
1938 | "movq %%mm4, %%mm6 \n\t" | ||
1939 | "punpcklbw %%mm3, %%mm5 \n\t" | ||
1940 | "punpckhbw %%mm3, %%mm7 \n\t" | ||
1941 | "punpcklbw %%mm2, %%mm4 \n\t" | ||
1942 | "punpckhbw %%mm2, %%mm6 \n\t" | ||
1943 | #if 1 | ||
1944 | MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t" | ||
1945 | MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t" | ||
1946 | MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t" | ||
1947 | MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t" | ||
1948 | #else | ||
1949 | "movq %%mm5, (%2, %%"REG_a", 2) \n\t" | ||
1950 | "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t" | ||
1951 | "movq %%mm4, (%3, %%"REG_a", 2) \n\t" | ||
1952 | "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t" | ||
1953 | #endif | ||
1954 | "add $8, %%"REG_a" \n\t" | ||
1955 | " js 1b \n\t" | ||
1956 | :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ), | ||
1957 | "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2), | ||
1958 | "g" (-mmxSize) | ||
1959 | : "%"REG_a | ||
1960 | |||
1961 | ); | ||
1962 | #else | ||
1963 | const long mmxSize=1; | ||
1964 | #endif | ||
1965 | dst[0 ]= (3*src[0] + src[srcStride])>>2; | ||
1966 | dst[dstStride]= ( src[0] + 3*src[srcStride])>>2; | ||
1967 | |||
1968 | for (x=mmxSize-1; x<srcWidth-1; x++){ | ||
1969 | dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2; | ||
1970 | dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2; | ||
1971 | dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2; | ||
1972 | dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2; | ||
1973 | } | ||
1974 | dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2; | ||
1975 | dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2; | ||
1976 | |||
1977 | dst+=dstStride*2; | ||
1978 | src+=srcStride; | ||
1979 | } | ||
1980 | |||
1981 | // last line | ||
1982 | #if 1 | ||
1983 | dst[0]= src[0]; | ||
1984 | |||
1985 | for (x=0; x<srcWidth-1; x++){ | ||
1986 | dst[2*x+1]= (3*src[x] + src[x+1])>>2; | ||
1987 | dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; | ||
1988 | } | ||
1989 | dst[2*srcWidth-1]= src[srcWidth-1]; | ||
1990 | #else | ||
1991 | for (x=0; x<srcWidth; x++){ | ||
1992 | dst[2*x+0]= | ||
1993 | dst[2*x+1]= src[x]; | ||
1994 | } | ||
1995 | #endif | ||
1996 | |||
1997 | #ifdef HAVE_MMX | ||
1998 | asm volatile( EMMS" \n\t" | ||
1999 | SFENCE" \n\t" | ||
2000 | :::"memory"); | ||
2001 | #endif | ||
2002 | } | ||
2003 | |||
2004 | /** | ||
2005 | * Height should be a multiple of 2 and width should be a multiple of 16. | ||
2006 | * (If this is a problem for anyone then tell me, and I will fix it.) | ||
2007 | * Chrominance data is only taken from every second line, others are ignored. | ||
2008 | * FIXME: Write HQ version. | ||
2009 | */ | ||
2010 | static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | ||
2011 | long width, long height, | ||
2012 | long lumStride, long chromStride, long srcStride) | ||
2013 | { | ||
2014 | long y; | ||
2015 | const long chromWidth= width>>1; | ||
2016 | for (y=0; y<height; y+=2) | ||
2017 | { | ||
2018 | #ifdef HAVE_MMX | ||
2019 | asm volatile( | ||
2020 | "xorl %%eax, %%eax \n\t" | ||
2021 | "pcmpeqw %%mm7, %%mm7 \n\t" | ||
2022 | "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... | ||
2023 | ASMALIGN(4) | ||
2024 | "1: \n\t" | ||
2025 | PREFETCH" 64(%0, %%eax, 4) \n\t" | ||
2026 | "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0) | ||
2027 | "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4) | ||
2028 | "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0) | ||
2029 | "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4) | ||
2030 | "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0) | ||
2031 | "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4) | ||
2032 | "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0) | ||
2033 | "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4) | ||
2034 | "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | ||
2035 | "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) | ||
2036 | |||
2037 | MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t" | ||
2038 | |||
2039 | "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8) | ||
2040 | "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12) | ||
2041 | "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8) | ||
2042 | "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12) | ||
2043 | "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8) | ||
2044 | "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12) | ||
2045 | "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8) | ||
2046 | "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12) | ||
2047 | "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) | ||
2048 | "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) | ||
2049 | |||
2050 | MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t" | ||
2051 | |||
2052 | "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) | ||
2053 | "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) | ||
2054 | "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) | ||
2055 | "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) | ||
2056 | "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) | ||
2057 | "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) | ||
2058 | "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) | ||
2059 | "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) | ||
2060 | |||
2061 | MOVNTQ" %%mm0, (%3, %%eax) \n\t" | ||
2062 | MOVNTQ" %%mm2, (%2, %%eax) \n\t" | ||
2063 | |||
2064 | "addl $8, %%eax \n\t" | ||
2065 | "cmpl %4, %%eax \n\t" | ||
2066 | " jb 1b \n\t" | ||
2067 | ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | ||
2068 | : "memory", "%eax" | ||
2069 | ); | ||
2070 | |||
2071 | ydst += lumStride; | ||
2072 | src += srcStride; | ||
2073 | |||
2074 | asm volatile( | ||
2075 | "xorl %%eax, %%eax \n\t" | ||
2076 | ASMALIGN(4) | ||
2077 | "1: \n\t" | ||
2078 | PREFETCH" 64(%0, %%eax, 4) \n\t" | ||
2079 | "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0) | ||
2080 | "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4) | ||
2081 | "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8) | ||
2082 | "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12) | ||
2083 | "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0) | ||
2084 | "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4) | ||
2085 | "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8) | ||
2086 | "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12) | ||
2087 | "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) | ||
2088 | "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) | ||
2089 | |||
2090 | MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t" | ||
2091 | MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t" | ||
2092 | |||
2093 | "addl $8, %%eax \n\t" | ||
2094 | "cmpl %4, %%eax \n\t" | ||
2095 | " jb 1b \n\t" | ||
2096 | |||
2097 | ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | ||
2098 | : "memory", "%eax" | ||
2099 | ); | ||
2100 | #else | ||
2101 | long i; | ||
2102 | for (i=0; i<chromWidth; i++) | ||
2103 | { | ||
2104 | udst[i] = src[4*i+0]; | ||
2105 | ydst[2*i+0] = src[4*i+1]; | ||
2106 | vdst[i] = src[4*i+2]; | ||
2107 | ydst[2*i+1] = src[4*i+3]; | ||
2108 | } | ||
2109 | ydst += lumStride; | ||
2110 | src += srcStride; | ||
2111 | |||
2112 | for (i=0; i<chromWidth; i++) | ||
2113 | { | ||
2114 | ydst[2*i+0] = src[4*i+1]; | ||
2115 | ydst[2*i+1] = src[4*i+3]; | ||
2116 | } | ||
2117 | #endif | ||
2118 | udst += chromStride; | ||
2119 | vdst += chromStride; | ||
2120 | ydst += lumStride; | ||
2121 | src += srcStride; | ||
2122 | } | ||
2123 | #ifdef HAVE_MMX | ||
2124 | asm volatile( EMMS" \n\t" | ||
2125 | SFENCE" \n\t" | ||
2126 | :::"memory"); | ||
2127 | #endif | ||
2128 | } | ||
2129 | |||
2130 | /** | ||
2131 | * Height should be a multiple of 2 and width should be a multiple of 2. | ||
2132 | * (If this is a problem for anyone then tell me, and I will fix it.) | ||
2133 | * Chrominance data is only taken from every second line, | ||
2134 | * others are ignored in the C version. | ||
2135 | * FIXME: Write HQ version. | ||
2136 | */ | ||
2137 | static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | ||
2138 | long width, long height, | ||
2139 | long lumStride, long chromStride, long srcStride) | ||
2140 | { | ||
2141 | long y; | ||
2142 | const long chromWidth= width>>1; | ||
2143 | #ifdef HAVE_MMX | ||
2144 | for (y=0; y<height-2; y+=2) | ||
2145 | { | ||
2146 | long i; | ||
2147 | for (i=0; i<2; i++) | ||
2148 | { | ||
2149 | asm volatile( | ||
2150 | "mov %2, %%"REG_a" \n\t" | ||
2151 | "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t" | ||
2152 | "movq "MANGLE(ff_w1111)", %%mm5 \n\t" | ||
2153 | "pxor %%mm7, %%mm7 \n\t" | ||
2154 | "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" | ||
2155 | ASMALIGN(4) | ||
2156 | "1: \n\t" | ||
2157 | PREFETCH" 64(%0, %%"REG_d") \n\t" | ||
2158 | "movd (%0, %%"REG_d"), %%mm0 \n\t" | ||
2159 | "movd 3(%0, %%"REG_d"), %%mm1 \n\t" | ||
2160 | "punpcklbw %%mm7, %%mm0 \n\t" | ||
2161 | "punpcklbw %%mm7, %%mm1 \n\t" | ||
2162 | "movd 6(%0, %%"REG_d"), %%mm2 \n\t" | ||
2163 | "movd 9(%0, %%"REG_d"), %%mm3 \n\t" | ||
2164 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
2165 | "punpcklbw %%mm7, %%mm3 \n\t" | ||
2166 | "pmaddwd %%mm6, %%mm0 \n\t" | ||
2167 | "pmaddwd %%mm6, %%mm1 \n\t" | ||
2168 | "pmaddwd %%mm6, %%mm2 \n\t" | ||
2169 | "pmaddwd %%mm6, %%mm3 \n\t" | ||
2170 | #ifndef FAST_BGR2YV12 | ||
2171 | "psrad $8, %%mm0 \n\t" | ||
2172 | "psrad $8, %%mm1 \n\t" | ||
2173 | "psrad $8, %%mm2 \n\t" | ||
2174 | "psrad $8, %%mm3 \n\t" | ||
2175 | #endif | ||
2176 | "packssdw %%mm1, %%mm0 \n\t" | ||
2177 | "packssdw %%mm3, %%mm2 \n\t" | ||
2178 | "pmaddwd %%mm5, %%mm0 \n\t" | ||
2179 | "pmaddwd %%mm5, %%mm2 \n\t" | ||
2180 | "packssdw %%mm2, %%mm0 \n\t" | ||
2181 | "psraw $7, %%mm0 \n\t" | ||
2182 | |||
2183 | "movd 12(%0, %%"REG_d"), %%mm4 \n\t" | ||
2184 | "movd 15(%0, %%"REG_d"), %%mm1 \n\t" | ||
2185 | "punpcklbw %%mm7, %%mm4 \n\t" | ||
2186 | "punpcklbw %%mm7, %%mm1 \n\t" | ||
2187 | "movd 18(%0, %%"REG_d"), %%mm2 \n\t" | ||
2188 | "movd 21(%0, %%"REG_d"), %%mm3 \n\t" | ||
2189 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
2190 | "punpcklbw %%mm7, %%mm3 \n\t" | ||
2191 | "pmaddwd %%mm6, %%mm4 \n\t" | ||
2192 | "pmaddwd %%mm6, %%mm1 \n\t" | ||
2193 | "pmaddwd %%mm6, %%mm2 \n\t" | ||
2194 | "pmaddwd %%mm6, %%mm3 \n\t" | ||
2195 | #ifndef FAST_BGR2YV12 | ||
2196 | "psrad $8, %%mm4 \n\t" | ||
2197 | "psrad $8, %%mm1 \n\t" | ||
2198 | "psrad $8, %%mm2 \n\t" | ||
2199 | "psrad $8, %%mm3 \n\t" | ||
2200 | #endif | ||
2201 | "packssdw %%mm1, %%mm4 \n\t" | ||
2202 | "packssdw %%mm3, %%mm2 \n\t" | ||
2203 | "pmaddwd %%mm5, %%mm4 \n\t" | ||
2204 | "pmaddwd %%mm5, %%mm2 \n\t" | ||
2205 | "add $24, %%"REG_d" \n\t" | ||
2206 | "packssdw %%mm2, %%mm4 \n\t" | ||
2207 | "psraw $7, %%mm4 \n\t" | ||
2208 | |||
2209 | "packuswb %%mm4, %%mm0 \n\t" | ||
2210 | "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t" | ||
2211 | |||
2212 | MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t" | ||
2213 | "add $8, %%"REG_a" \n\t" | ||
2214 | " js 1b \n\t" | ||
2215 | : : "r" (src+width*3), "r" (ydst+width), "g" (-width) | ||
2216 | : "%"REG_a, "%"REG_d | ||
2217 | ); | ||
2218 | ydst += lumStride; | ||
2219 | src += srcStride; | ||
2220 | } | ||
2221 | src -= srcStride*2; | ||
2222 | asm volatile( | ||
2223 | "mov %4, %%"REG_a" \n\t" | ||
2224 | "movq "MANGLE(ff_w1111)", %%mm5 \n\t" | ||
2225 | "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t" | ||
2226 | "pxor %%mm7, %%mm7 \n\t" | ||
2227 | "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" | ||
2228 | "add %%"REG_d", %%"REG_d" \n\t" | ||
2229 | ASMALIGN(4) | ||
2230 | "1: \n\t" | ||
2231 | PREFETCH" 64(%0, %%"REG_d") \n\t" | ||
2232 | PREFETCH" 64(%1, %%"REG_d") \n\t" | ||
2233 | #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | ||
2234 | "movq (%0, %%"REG_d"), %%mm0 \n\t" | ||
2235 | "movq (%1, %%"REG_d"), %%mm1 \n\t" | ||
2236 | "movq 6(%0, %%"REG_d"), %%mm2 \n\t" | ||
2237 | "movq 6(%1, %%"REG_d"), %%mm3 \n\t" | ||
2238 | PAVGB" %%mm1, %%mm0 \n\t" | ||
2239 | PAVGB" %%mm3, %%mm2 \n\t" | ||
2240 | "movq %%mm0, %%mm1 \n\t" | ||
2241 | "movq %%mm2, %%mm3 \n\t" | ||
2242 | "psrlq $24, %%mm0 \n\t" | ||
2243 | "psrlq $24, %%mm2 \n\t" | ||
2244 | PAVGB" %%mm1, %%mm0 \n\t" | ||
2245 | PAVGB" %%mm3, %%mm2 \n\t" | ||
2246 | "punpcklbw %%mm7, %%mm0 \n\t" | ||
2247 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
2248 | #else | ||
2249 | "movd (%0, %%"REG_d"), %%mm0 \n\t" | ||
2250 | "movd (%1, %%"REG_d"), %%mm1 \n\t" | ||
2251 | "movd 3(%0, %%"REG_d"), %%mm2 \n\t" | ||
2252 | "movd 3(%1, %%"REG_d"), %%mm3 \n\t" | ||
2253 | "punpcklbw %%mm7, %%mm0 \n\t" | ||
2254 | "punpcklbw %%mm7, %%mm1 \n\t" | ||
2255 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
2256 | "punpcklbw %%mm7, %%mm3 \n\t" | ||
2257 | "paddw %%mm1, %%mm0 \n\t" | ||
2258 | "paddw %%mm3, %%mm2 \n\t" | ||
2259 | "paddw %%mm2, %%mm0 \n\t" | ||
2260 | "movd 6(%0, %%"REG_d"), %%mm4 \n\t" | ||
2261 | "movd 6(%1, %%"REG_d"), %%mm1 \n\t" | ||
2262 | "movd 9(%0, %%"REG_d"), %%mm2 \n\t" | ||
2263 | "movd 9(%1, %%"REG_d"), %%mm3 \n\t" | ||
2264 | "punpcklbw %%mm7, %%mm4 \n\t" | ||
2265 | "punpcklbw %%mm7, %%mm1 \n\t" | ||
2266 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
2267 | "punpcklbw %%mm7, %%mm3 \n\t" | ||
2268 | "paddw %%mm1, %%mm4 \n\t" | ||
2269 | "paddw %%mm3, %%mm2 \n\t" | ||
2270 | "paddw %%mm4, %%mm2 \n\t" | ||
2271 | "psrlw $2, %%mm0 \n\t" | ||
2272 | "psrlw $2, %%mm2 \n\t" | ||
2273 | #endif | ||
2274 | "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t" | ||
2275 | "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t" | ||
2276 | |||
2277 | "pmaddwd %%mm0, %%mm1 \n\t" | ||
2278 | "pmaddwd %%mm2, %%mm3 \n\t" | ||
2279 | "pmaddwd %%mm6, %%mm0 \n\t" | ||
2280 | "pmaddwd %%mm6, %%mm2 \n\t" | ||
2281 | #ifndef FAST_BGR2YV12 | ||
2282 | "psrad $8, %%mm0 \n\t" | ||
2283 | "psrad $8, %%mm1 \n\t" | ||
2284 | "psrad $8, %%mm2 \n\t" | ||
2285 | "psrad $8, %%mm3 \n\t" | ||
2286 | #endif | ||
2287 | "packssdw %%mm2, %%mm0 \n\t" | ||
2288 | "packssdw %%mm3, %%mm1 \n\t" | ||
2289 | "pmaddwd %%mm5, %%mm0 \n\t" | ||
2290 | "pmaddwd %%mm5, %%mm1 \n\t" | ||
2291 | "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 | ||
2292 | "psraw $7, %%mm0 \n\t" | ||
2293 | |||
2294 | #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | ||
2295 | "movq 12(%0, %%"REG_d"), %%mm4 \n\t" | ||
2296 | "movq 12(%1, %%"REG_d"), %%mm1 \n\t" | ||
2297 | "movq 18(%0, %%"REG_d"), %%mm2 \n\t" | ||
2298 | "movq 18(%1, %%"REG_d"), %%mm3 \n\t" | ||
2299 | PAVGB" %%mm1, %%mm4 \n\t" | ||
2300 | PAVGB" %%mm3, %%mm2 \n\t" | ||
2301 | "movq %%mm4, %%mm1 \n\t" | ||
2302 | "movq %%mm2, %%mm3 \n\t" | ||
2303 | "psrlq $24, %%mm4 \n\t" | ||
2304 | "psrlq $24, %%mm2 \n\t" | ||
2305 | PAVGB" %%mm1, %%mm4 \n\t" | ||
2306 | PAVGB" %%mm3, %%mm2 \n\t" | ||
2307 | "punpcklbw %%mm7, %%mm4 \n\t" | ||
2308 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
2309 | #else | ||
2310 | "movd 12(%0, %%"REG_d"), %%mm4 \n\t" | ||
2311 | "movd 12(%1, %%"REG_d"), %%mm1 \n\t" | ||
2312 | "movd 15(%0, %%"REG_d"), %%mm2 \n\t" | ||
2313 | "movd 15(%1, %%"REG_d"), %%mm3 \n\t" | ||
2314 | "punpcklbw %%mm7, %%mm4 \n\t" | ||
2315 | "punpcklbw %%mm7, %%mm1 \n\t" | ||
2316 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
2317 | "punpcklbw %%mm7, %%mm3 \n\t" | ||
2318 | "paddw %%mm1, %%mm4 \n\t" | ||
2319 | "paddw %%mm3, %%mm2 \n\t" | ||
2320 | "paddw %%mm2, %%mm4 \n\t" | ||
2321 | "movd 18(%0, %%"REG_d"), %%mm5 \n\t" | ||
2322 | "movd 18(%1, %%"REG_d"), %%mm1 \n\t" | ||
2323 | "movd 21(%0, %%"REG_d"), %%mm2 \n\t" | ||
2324 | "movd 21(%1, %%"REG_d"), %%mm3 \n\t" | ||
2325 | "punpcklbw %%mm7, %%mm5 \n\t" | ||
2326 | "punpcklbw %%mm7, %%mm1 \n\t" | ||
2327 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
2328 | "punpcklbw %%mm7, %%mm3 \n\t" | ||
2329 | "paddw %%mm1, %%mm5 \n\t" | ||
2330 | "paddw %%mm3, %%mm2 \n\t" | ||
2331 | "paddw %%mm5, %%mm2 \n\t" | ||
2332 | "movq "MANGLE(ff_w1111)", %%mm5 \n\t" | ||
2333 | "psrlw $2, %%mm4 \n\t" | ||
2334 | "psrlw $2, %%mm2 \n\t" | ||
2335 | #endif | ||
2336 | "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t" | ||
2337 | "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t" | ||
2338 | |||
2339 | "pmaddwd %%mm4, %%mm1 \n\t" | ||
2340 | "pmaddwd %%mm2, %%mm3 \n\t" | ||
2341 | "pmaddwd %%mm6, %%mm4 \n\t" | ||
2342 | "pmaddwd %%mm6, %%mm2 \n\t" | ||
2343 | #ifndef FAST_BGR2YV12 | ||
2344 | "psrad $8, %%mm4 \n\t" | ||
2345 | "psrad $8, %%mm1 \n\t" | ||
2346 | "psrad $8, %%mm2 \n\t" | ||
2347 | "psrad $8, %%mm3 \n\t" | ||
2348 | #endif | ||
2349 | "packssdw %%mm2, %%mm4 \n\t" | ||
2350 | "packssdw %%mm3, %%mm1 \n\t" | ||
2351 | "pmaddwd %%mm5, %%mm4 \n\t" | ||
2352 | "pmaddwd %%mm5, %%mm1 \n\t" | ||
2353 | "add $24, %%"REG_d" \n\t" | ||
2354 | "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 | ||
2355 | "psraw $7, %%mm4 \n\t" | ||
2356 | |||
2357 | "movq %%mm0, %%mm1 \n\t" | ||
2358 | "punpckldq %%mm4, %%mm0 \n\t" | ||
2359 | "punpckhdq %%mm4, %%mm1 \n\t" | ||
2360 | "packsswb %%mm1, %%mm0 \n\t" | ||
2361 | "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t" | ||
2362 | "movd %%mm0, (%2, %%"REG_a") \n\t" | ||
2363 | "punpckhdq %%mm0, %%mm0 \n\t" | ||
2364 | "movd %%mm0, (%3, %%"REG_a") \n\t" | ||
2365 | "add $4, %%"REG_a" \n\t" | ||
2366 | " js 1b \n\t" | ||
2367 | : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth) | ||
2368 | : "%"REG_a, "%"REG_d | ||
2369 | ); | ||
2370 | |||
2371 | udst += chromStride; | ||
2372 | vdst += chromStride; | ||
2373 | src += srcStride*2; | ||
2374 | } | ||
2375 | |||
2376 | asm volatile( EMMS" \n\t" | ||
2377 | SFENCE" \n\t" | ||
2378 | :::"memory"); | ||
2379 | #else | ||
2380 | y=0; | ||
2381 | #endif | ||
2382 | for (; y<height; y+=2) | ||
2383 | { | ||
2384 | long i; | ||
2385 | for (i=0; i<chromWidth; i++) | ||
2386 | { | ||
2387 | unsigned int b = src[6*i+0]; | ||
2388 | unsigned int g = src[6*i+1]; | ||
2389 | unsigned int r = src[6*i+2]; | ||
2390 | |||
2391 | unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; | ||
2392 | unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128; | ||
2393 | unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128; | ||
2394 | |||
2395 | udst[i] = U; | ||
2396 | vdst[i] = V; | ||
2397 | ydst[2*i] = Y; | ||
2398 | |||
2399 | b = src[6*i+3]; | ||
2400 | g = src[6*i+4]; | ||
2401 | r = src[6*i+5]; | ||
2402 | |||
2403 | Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; | ||
2404 | ydst[2*i+1] = Y; | ||
2405 | } | ||
2406 | ydst += lumStride; | ||
2407 | src += srcStride; | ||
2408 | |||
2409 | for (i=0; i<chromWidth; i++) | ||
2410 | { | ||
2411 | unsigned int b = src[6*i+0]; | ||
2412 | unsigned int g = src[6*i+1]; | ||
2413 | unsigned int r = src[6*i+2]; | ||
2414 | |||
2415 | unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; | ||
2416 | |||
2417 | ydst[2*i] = Y; | ||
2418 | |||
2419 | b = src[6*i+3]; | ||
2420 | g = src[6*i+4]; | ||
2421 | r = src[6*i+5]; | ||
2422 | |||
2423 | Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16; | ||
2424 | ydst[2*i+1] = Y; | ||
2425 | } | ||
2426 | udst += chromStride; | ||
2427 | vdst += chromStride; | ||
2428 | ydst += lumStride; | ||
2429 | src += srcStride; | ||
2430 | } | ||
2431 | } | ||
2432 | |||
2433 | void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest, | ||
2434 | long width, long height, long src1Stride, | ||
2435 | long src2Stride, long dstStride){ | ||
2436 | long h; | ||
2437 | |||
2438 | for (h=0; h < height; h++) | ||
2439 | { | ||
2440 | long w; | ||
2441 | |||
2442 | #ifdef HAVE_MMX | ||
2443 | #ifdef HAVE_SSE2 | ||
2444 | asm( | ||
2445 | "xor %%"REG_a", %%"REG_a" \n\t" | ||
2446 | "1: \n\t" | ||
2447 | PREFETCH" 64(%1, %%"REG_a") \n\t" | ||
2448 | PREFETCH" 64(%2, %%"REG_a") \n\t" | ||
2449 | "movdqa (%1, %%"REG_a"), %%xmm0 \n\t" | ||
2450 | "movdqa (%1, %%"REG_a"), %%xmm1 \n\t" | ||
2451 | "movdqa (%2, %%"REG_a"), %%xmm2 \n\t" | ||
2452 | "punpcklbw %%xmm2, %%xmm0 \n\t" | ||
2453 | "punpckhbw %%xmm2, %%xmm1 \n\t" | ||
2454 | "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t" | ||
2455 | "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t" | ||
2456 | "add $16, %%"REG_a" \n\t" | ||
2457 | "cmp %3, %%"REG_a" \n\t" | ||
2458 | " jb 1b \n\t" | ||
2459 | ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15) | ||
2460 | : "memory", "%"REG_a"" | ||
2461 | ); | ||
2462 | #else | ||
2463 | asm( | ||
2464 | "xor %%"REG_a", %%"REG_a" \n\t" | ||
2465 | "1: \n\t" | ||
2466 | PREFETCH" 64(%1, %%"REG_a") \n\t" | ||
2467 | PREFETCH" 64(%2, %%"REG_a") \n\t" | ||
2468 | "movq (%1, %%"REG_a"), %%mm0 \n\t" | ||
2469 | "movq 8(%1, %%"REG_a"), %%mm2 \n\t" | ||
2470 | "movq %%mm0, %%mm1 \n\t" | ||
2471 | "movq %%mm2, %%mm3 \n\t" | ||
2472 | "movq (%2, %%"REG_a"), %%mm4 \n\t" | ||
2473 | "movq 8(%2, %%"REG_a"), %%mm5 \n\t" | ||
2474 | "punpcklbw %%mm4, %%mm0 \n\t" | ||
2475 | "punpckhbw %%mm4, %%mm1 \n\t" | ||
2476 | "punpcklbw %%mm5, %%mm2 \n\t" | ||
2477 | "punpckhbw %%mm5, %%mm3 \n\t" | ||
2478 | MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t" | ||
2479 | MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t" | ||
2480 | MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t" | ||
2481 | MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t" | ||
2482 | "add $16, %%"REG_a" \n\t" | ||
2483 | "cmp %3, %%"REG_a" \n\t" | ||
2484 | " jb 1b \n\t" | ||
2485 | ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15) | ||
2486 | : "memory", "%"REG_a | ||
2487 | ); | ||
2488 | #endif | ||
2489 | for (w= (width&(~15)); w < width; w++) | ||
2490 | { | ||
2491 | dest[2*w+0] = src1[w]; | ||
2492 | dest[2*w+1] = src2[w]; | ||
2493 | } | ||
2494 | #else | ||
2495 | for (w=0; w < width; w++) | ||
2496 | { | ||
2497 | dest[2*w+0] = src1[w]; | ||
2498 | dest[2*w+1] = src2[w]; | ||
2499 | } | ||
2500 | #endif | ||
2501 | dest += dstStride; | ||
2502 | src1 += src1Stride; | ||
2503 | src2 += src2Stride; | ||
2504 | } | ||
2505 | #ifdef HAVE_MMX | ||
2506 | asm( | ||
2507 | EMMS" \n\t" | ||
2508 | SFENCE" \n\t" | ||
2509 | ::: "memory" | ||
2510 | ); | ||
2511 | #endif | ||
2512 | } | ||
2513 | |||
2514 | static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2, | ||
2515 | uint8_t *dst1, uint8_t *dst2, | ||
2516 | long width, long height, | ||
2517 | long srcStride1, long srcStride2, | ||
2518 | long dstStride1, long dstStride2) | ||
2519 | { | ||
2520 | long y,x,w,h; | ||
2521 | w=width/2; h=height/2; | ||
2522 | #ifdef HAVE_MMX | ||
2523 | asm volatile( | ||
2524 | PREFETCH" %0 \n\t" | ||
2525 | PREFETCH" %1 \n\t" | ||
2526 | ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory"); | ||
2527 | #endif | ||
2528 | for (y=0;y<h;y++){ | ||
2529 | const uint8_t* s1=src1+srcStride1*(y>>1); | ||
2530 | uint8_t* d=dst1+dstStride1*y; | ||
2531 | x=0; | ||
2532 | #ifdef HAVE_MMX | ||
2533 | for (;x<w-31;x+=32) | ||
2534 | { | ||
2535 | asm volatile( | ||
2536 | PREFETCH" 32%1 \n\t" | ||
2537 | "movq %1, %%mm0 \n\t" | ||
2538 | "movq 8%1, %%mm2 \n\t" | ||
2539 | "movq 16%1, %%mm4 \n\t" | ||
2540 | "movq 24%1, %%mm6 \n\t" | ||
2541 | "movq %%mm0, %%mm1 \n\t" | ||
2542 | "movq %%mm2, %%mm3 \n\t" | ||
2543 | "movq %%mm4, %%mm5 \n\t" | ||
2544 | "movq %%mm6, %%mm7 \n\t" | ||
2545 | "punpcklbw %%mm0, %%mm0 \n\t" | ||
2546 | "punpckhbw %%mm1, %%mm1 \n\t" | ||
2547 | "punpcklbw %%mm2, %%mm2 \n\t" | ||
2548 | "punpckhbw %%mm3, %%mm3 \n\t" | ||
2549 | "punpcklbw %%mm4, %%mm4 \n\t" | ||
2550 | "punpckhbw %%mm5, %%mm5 \n\t" | ||
2551 | "punpcklbw %%mm6, %%mm6 \n\t" | ||
2552 | "punpckhbw %%mm7, %%mm7 \n\t" | ||
2553 | MOVNTQ" %%mm0, %0 \n\t" | ||
2554 | MOVNTQ" %%mm1, 8%0 \n\t" | ||
2555 | MOVNTQ" %%mm2, 16%0 \n\t" | ||
2556 | MOVNTQ" %%mm3, 24%0 \n\t" | ||
2557 | MOVNTQ" %%mm4, 32%0 \n\t" | ||
2558 | MOVNTQ" %%mm5, 40%0 \n\t" | ||
2559 | MOVNTQ" %%mm6, 48%0 \n\t" | ||
2560 | MOVNTQ" %%mm7, 56%0" | ||
2561 | :"=m"(d[2*x]) | ||
2562 | :"m"(s1[x]) | ||
2563 | :"memory"); | ||
2564 | } | ||
2565 | #endif | ||
2566 | for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x]; | ||
2567 | } | ||
2568 | for (y=0;y<h;y++){ | ||
2569 | const uint8_t* s2=src2+srcStride2*(y>>1); | ||
2570 | uint8_t* d=dst2+dstStride2*y; | ||
2571 | x=0; | ||
2572 | #ifdef HAVE_MMX | ||
2573 | for (;x<w-31;x+=32) | ||
2574 | { | ||
2575 | asm volatile( | ||
2576 | PREFETCH" 32%1 \n\t" | ||
2577 | "movq %1, %%mm0 \n\t" | ||
2578 | "movq 8%1, %%mm2 \n\t" | ||
2579 | "movq 16%1, %%mm4 \n\t" | ||
2580 | "movq 24%1, %%mm6 \n\t" | ||
2581 | "movq %%mm0, %%mm1 \n\t" | ||
2582 | "movq %%mm2, %%mm3 \n\t" | ||
2583 | "movq %%mm4, %%mm5 \n\t" | ||
2584 | "movq %%mm6, %%mm7 \n\t" | ||
2585 | "punpcklbw %%mm0, %%mm0 \n\t" | ||
2586 | "punpckhbw %%mm1, %%mm1 \n\t" | ||
2587 | "punpcklbw %%mm2, %%mm2 \n\t" | ||
2588 | "punpckhbw %%mm3, %%mm3 \n\t" | ||
2589 | "punpcklbw %%mm4, %%mm4 \n\t" | ||
2590 | "punpckhbw %%mm5, %%mm5 \n\t" | ||
2591 | "punpcklbw %%mm6, %%mm6 \n\t" | ||
2592 | "punpckhbw %%mm7, %%mm7 \n\t" | ||
2593 | MOVNTQ" %%mm0, %0 \n\t" | ||
2594 | MOVNTQ" %%mm1, 8%0 \n\t" | ||
2595 | MOVNTQ" %%mm2, 16%0 \n\t" | ||
2596 | MOVNTQ" %%mm3, 24%0 \n\t" | ||
2597 | MOVNTQ" %%mm4, 32%0 \n\t" | ||
2598 | MOVNTQ" %%mm5, 40%0 \n\t" | ||
2599 | MOVNTQ" %%mm6, 48%0 \n\t" | ||
2600 | MOVNTQ" %%mm7, 56%0" | ||
2601 | :"=m"(d[2*x]) | ||
2602 | :"m"(s2[x]) | ||
2603 | :"memory"); | ||
2604 | } | ||
2605 | #endif | ||
2606 | for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x]; | ||
2607 | } | ||
2608 | #ifdef HAVE_MMX | ||
2609 | asm( | ||
2610 | EMMS" \n\t" | ||
2611 | SFENCE" \n\t" | ||
2612 | ::: "memory" | ||
2613 | ); | ||
2614 | #endif | ||
2615 | } | ||
2616 | |||
2617 | static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, | ||
2618 | uint8_t *dst, | ||
2619 | long width, long height, | ||
2620 | long srcStride1, long srcStride2, | ||
2621 | long srcStride3, long dstStride) | ||
2622 | { | ||
2623 | long y,x,w,h; | ||
2624 | w=width/2; h=height; | ||
2625 | for (y=0;y<h;y++){ | ||
2626 | const uint8_t* yp=src1+srcStride1*y; | ||
2627 | const uint8_t* up=src2+srcStride2*(y>>2); | ||
2628 | const uint8_t* vp=src3+srcStride3*(y>>2); | ||
2629 | uint8_t* d=dst+dstStride*y; | ||
2630 | x=0; | ||
2631 | #ifdef HAVE_MMX | ||
2632 | for (;x<w-7;x+=8) | ||
2633 | { | ||
2634 | asm volatile( | ||
2635 | PREFETCH" 32(%1, %0) \n\t" | ||
2636 | PREFETCH" 32(%2, %0) \n\t" | ||
2637 | PREFETCH" 32(%3, %0) \n\t" | ||
2638 | "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ | ||
2639 | "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */ | ||
2640 | "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */ | ||
2641 | "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ | ||
2642 | "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */ | ||
2643 | "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */ | ||
2644 | "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */ | ||
2645 | "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */ | ||
2646 | "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */ | ||
2647 | "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */ | ||
2648 | |||
2649 | "movq %%mm1, %%mm6 \n\t" | ||
2650 | "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/ | ||
2651 | "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/ | ||
2652 | "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/ | ||
2653 | MOVNTQ" %%mm0, (%4, %0, 8) \n\t" | ||
2654 | MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t" | ||
2655 | |||
2656 | "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/ | ||
2657 | "movq 8(%1, %0, 4), %%mm0 \n\t" | ||
2658 | "movq %%mm0, %%mm3 \n\t" | ||
2659 | "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/ | ||
2660 | "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/ | ||
2661 | MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t" | ||
2662 | MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t" | ||
2663 | |||
2664 | "movq %%mm4, %%mm6 \n\t" | ||
2665 | "movq 16(%1, %0, 4), %%mm0 \n\t" | ||
2666 | "movq %%mm0, %%mm3 \n\t" | ||
2667 | "punpcklbw %%mm5, %%mm4 \n\t" | ||
2668 | "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/ | ||
2669 | "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/ | ||
2670 | MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t" | ||
2671 | MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t" | ||
2672 | |||
2673 | "punpckhbw %%mm5, %%mm6 \n\t" | ||
2674 | "movq 24(%1, %0, 4), %%mm0 \n\t" | ||
2675 | "movq %%mm0, %%mm3 \n\t" | ||
2676 | "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/ | ||
2677 | "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/ | ||
2678 | MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t" | ||
2679 | MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t" | ||
2680 | |||
2681 | : "+r" (x) | ||
2682 | : "r"(yp), "r" (up), "r"(vp), "r"(d) | ||
2683 | :"memory"); | ||
2684 | } | ||
2685 | #endif | ||
2686 | for (; x<w; x++) | ||
2687 | { | ||
2688 | const long x2 = x<<2; | ||
2689 | d[8*x+0] = yp[x2]; | ||
2690 | d[8*x+1] = up[x]; | ||
2691 | d[8*x+2] = yp[x2+1]; | ||
2692 | d[8*x+3] = vp[x]; | ||
2693 | d[8*x+4] = yp[x2+2]; | ||
2694 | d[8*x+5] = up[x]; | ||
2695 | d[8*x+6] = yp[x2+3]; | ||
2696 | d[8*x+7] = vp[x]; | ||
2697 | } | ||
2698 | } | ||
2699 | #ifdef HAVE_MMX | ||
2700 | asm( | ||
2701 | EMMS" \n\t" | ||
2702 | SFENCE" \n\t" | ||
2703 | ::: "memory" | ||
2704 | ); | ||
2705 | #endif | ||
2706 | } | ||
2707 | |||
2708 | static inline void RENAME(rgb2rgb_init)(void){ | ||
2709 | rgb15to16 = RENAME(rgb15to16); | ||
2710 | rgb15to24 = RENAME(rgb15to24); | ||
2711 | rgb15to32 = RENAME(rgb15to32); | ||
2712 | rgb16to24 = RENAME(rgb16to24); | ||
2713 | rgb16to32 = RENAME(rgb16to32); | ||
2714 | rgb16to15 = RENAME(rgb16to15); | ||
2715 | rgb24to16 = RENAME(rgb24to16); | ||
2716 | rgb24to15 = RENAME(rgb24to15); | ||
2717 | rgb24to32 = RENAME(rgb24to32); | ||
2718 | rgb32to16 = RENAME(rgb32to16); | ||
2719 | rgb32to15 = RENAME(rgb32to15); | ||
2720 | rgb32to24 = RENAME(rgb32to24); | ||
2721 | rgb24tobgr15 = RENAME(rgb24tobgr15); | ||
2722 | rgb24tobgr16 = RENAME(rgb24tobgr16); | ||
2723 | rgb24tobgr24 = RENAME(rgb24tobgr24); | ||
2724 | rgb32tobgr32 = RENAME(rgb32tobgr32); | ||
2725 | rgb32tobgr16 = RENAME(rgb32tobgr16); | ||
2726 | rgb32tobgr15 = RENAME(rgb32tobgr15); | ||
2727 | yv12toyuy2 = RENAME(yv12toyuy2); | ||
2728 | yv12touyvy = RENAME(yv12touyvy); | ||
2729 | yuv422ptoyuy2 = RENAME(yuv422ptoyuy2); | ||
2730 | yuy2toyv12 = RENAME(yuy2toyv12); | ||
2731 | // uyvytoyv12 = RENAME(uyvytoyv12); | ||
2732 | // yvu9toyv12 = RENAME(yvu9toyv12); | ||
2733 | planar2x = RENAME(planar2x); | ||
2734 | rgb24toyv12 = RENAME(rgb24toyv12); | ||
2735 | interleaveBytes = RENAME(interleaveBytes); | ||
2736 | vu9_to_vu12 = RENAME(vu9_to_vu12); | ||
2737 | yvu9_to_yuy2 = RENAME(yvu9_to_yuy2); | ||
2738 | } | ||
diff --git a/src/plugins/ffmpeg/libswscale/swscale-example.c b/src/plugins/ffmpeg/libswscale/swscale-example.c deleted file mode 100644 index bc2a8bf..0000000 --- a/src/plugins/ffmpeg/libswscale/swscale-example.c +++ /dev/null | |||
@@ -1,230 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at> | ||
3 | * | ||
4 | * This file is part of FFmpeg. | ||
5 | * | ||
6 | * FFmpeg is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU Lesser General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2.1 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * FFmpeg is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * Lesser General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU Lesser General Public | ||
17 | * License along with FFmpeg; if not, write to the Free Software | ||
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
19 | */ | ||
20 | |||
21 | #include <stdio.h> | ||
22 | #include <stdlib.h> | ||
23 | #include <string.h> | ||
24 | #include <inttypes.h> | ||
25 | #include <stdarg.h> | ||
26 | |||
27 | #undef HAVE_AV_CONFIG_H | ||
28 | #include "libavutil/avutil.h" | ||
29 | #include "swscale.h" | ||
30 | #include "swscale_internal.h" | ||
31 | #include "rgb2rgb.h" | ||
32 | |||
33 | static uint64_t getSSD(uint8_t *src1, uint8_t *src2, int stride1, int stride2, int w, int h){ | ||
34 | int x,y; | ||
35 | uint64_t ssd=0; | ||
36 | |||
37 | //printf("%d %d\n", w, h); | ||
38 | |||
39 | for (y=0; y<h; y++){ | ||
40 | for (x=0; x<w; x++){ | ||
41 | int d= src1[x + y*stride1] - src2[x + y*stride2]; | ||
42 | ssd+= d*d; | ||
43 | //printf("%d", abs(src1[x + y*stride1] - src2[x + y*stride2])/26 ); | ||
44 | } | ||
45 | //printf("\n"); | ||
46 | } | ||
47 | return ssd; | ||
48 | } | ||
49 | |||
50 | // test by ref -> src -> dst -> out & compare out against ref | ||
51 | // ref & out are YV12 | ||
52 | static int doTest(uint8_t *ref[3], int refStride[3], int w, int h, int srcFormat, int dstFormat, | ||
53 | int srcW, int srcH, int dstW, int dstH, int flags){ | ||
54 | uint8_t *src[3]; | ||
55 | uint8_t *dst[3]; | ||
56 | uint8_t *out[3]; | ||
57 | int srcStride[3], dstStride[3]; | ||
58 | int i; | ||
59 | uint64_t ssdY, ssdU, ssdV; | ||
60 | struct SwsContext *srcContext, *dstContext, *outContext; | ||
61 | int res; | ||
62 | |||
63 | res = 0; | ||
64 | for (i=0; i<3; i++){ | ||
65 | // avoid stride % bpp != 0 | ||
66 | if (srcFormat==PIX_FMT_RGB24 || srcFormat==PIX_FMT_BGR24) | ||
67 | srcStride[i]= srcW*3; | ||
68 | else | ||
69 | srcStride[i]= srcW*4; | ||
70 | |||
71 | if (dstFormat==PIX_FMT_RGB24 || dstFormat==PIX_FMT_BGR24) | ||
72 | dstStride[i]= dstW*3; | ||
73 | else | ||
74 | dstStride[i]= dstW*4; | ||
75 | |||
76 | src[i]= (uint8_t*) malloc(srcStride[i]*srcH); | ||
77 | dst[i]= (uint8_t*) malloc(dstStride[i]*dstH); | ||
78 | out[i]= (uint8_t*) malloc(refStride[i]*h); | ||
79 | if (!src[i] || !dst[i] || !out[i]) { | ||
80 | perror("Malloc"); | ||
81 | res = -1; | ||
82 | |||
83 | goto end; | ||
84 | } | ||
85 | } | ||
86 | |||
87 | dstContext = outContext = NULL; | ||
88 | srcContext= sws_getContext(w, h, PIX_FMT_YUV420P, srcW, srcH, srcFormat, flags, NULL, NULL, NULL); | ||
89 | if (!srcContext) { | ||
90 | fprintf(stderr, "Failed to get %s ---> %s\n", | ||
91 | sws_format_name(PIX_FMT_YUV420P), | ||
92 | sws_format_name(srcFormat)); | ||
93 | res = -1; | ||
94 | |||
95 | goto end; | ||
96 | } | ||
97 | dstContext= sws_getContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, NULL, NULL, NULL); | ||
98 | if (!dstContext) { | ||
99 | fprintf(stderr, "Failed to get %s ---> %s\n", | ||
100 | sws_format_name(srcFormat), | ||
101 | sws_format_name(dstFormat)); | ||
102 | res = -1; | ||
103 | |||
104 | goto end; | ||
105 | } | ||
106 | outContext= sws_getContext(dstW, dstH, dstFormat, w, h, PIX_FMT_YUV420P, flags, NULL, NULL, NULL); | ||
107 | if (!outContext) { | ||
108 | fprintf(stderr, "Failed to get %s ---> %s\n", | ||
109 | sws_format_name(dstFormat), | ||
110 | sws_format_name(PIX_FMT_YUV420P)); | ||
111 | res = -1; | ||
112 | |||
113 | goto end; | ||
114 | } | ||
115 | // printf("test %X %X %X -> %X %X %X\n", (int)ref[0], (int)ref[1], (int)ref[2], | ||
116 | // (int)src[0], (int)src[1], (int)src[2]); | ||
117 | |||
118 | sws_scale(srcContext, ref, refStride, 0, h , src, srcStride); | ||
119 | sws_scale(dstContext, src, srcStride, 0, srcH, dst, dstStride); | ||
120 | sws_scale(outContext, dst, dstStride, 0, dstH, out, refStride); | ||
121 | |||
122 | #if defined(ARCH_X86) | ||
123 | asm volatile ("emms\n\t"); | ||
124 | #endif | ||
125 | |||
126 | ssdY= getSSD(ref[0], out[0], refStride[0], refStride[0], w, h); | ||
127 | ssdU= getSSD(ref[1], out[1], refStride[1], refStride[1], (w+1)>>1, (h+1)>>1); | ||
128 | ssdV= getSSD(ref[2], out[2], refStride[2], refStride[2], (w+1)>>1, (h+1)>>1); | ||
129 | |||
130 | if (srcFormat == PIX_FMT_GRAY8 || dstFormat==PIX_FMT_GRAY8) ssdU=ssdV=0; //FIXME check that output is really gray | ||
131 | |||
132 | ssdY/= w*h; | ||
133 | ssdU/= w*h/4; | ||
134 | ssdV/= w*h/4; | ||
135 | |||
136 | if (ssdY>100 || ssdU>100 || ssdV>100){ | ||
137 | printf(" %s %dx%d -> %s %4dx%4d flags=%2d SSD=%5lld,%5lld,%5lld\n", | ||
138 | sws_format_name(srcFormat), srcW, srcH, | ||
139 | sws_format_name(dstFormat), dstW, dstH, | ||
140 | flags, | ||
141 | ssdY, ssdU, ssdV); | ||
142 | } | ||
143 | |||
144 | end: | ||
145 | |||
146 | sws_freeContext(srcContext); | ||
147 | sws_freeContext(dstContext); | ||
148 | sws_freeContext(outContext); | ||
149 | |||
150 | for (i=0; i<3; i++){ | ||
151 | free(src[i]); | ||
152 | free(dst[i]); | ||
153 | free(out[i]); | ||
154 | } | ||
155 | |||
156 | return res; | ||
157 | } | ||
158 | |||
159 | void fast_memcpy(void *a, void *b, int s){ //FIXME | ||
160 | memcpy(a, b, s); | ||
161 | } | ||
162 | |||
163 | static void selfTest(uint8_t *src[3], int stride[3], int w, int h){ | ||
164 | enum PixelFormat srcFormat, dstFormat; | ||
165 | int srcW, srcH, dstW, dstH; | ||
166 | int flags; | ||
167 | |||
168 | for (srcFormat = 0; srcFormat < PIX_FMT_NB; srcFormat++) { | ||
169 | for (dstFormat = 0; dstFormat < PIX_FMT_NB; dstFormat++) { | ||
170 | printf("%s -> %s\n", | ||
171 | sws_format_name(srcFormat), | ||
172 | sws_format_name(dstFormat)); | ||
173 | |||
174 | srcW= w; | ||
175 | srcH= h; | ||
176 | for (dstW=w - w/3; dstW<= 4*w/3; dstW+= w/3){ | ||
177 | for (dstH=h - h/3; dstH<= 4*h/3; dstH+= h/3){ | ||
178 | for (flags=1; flags<33; flags*=2) { | ||
179 | int res; | ||
180 | |||
181 | res = doTest(src, stride, w, h, srcFormat, dstFormat, | ||
182 | srcW, srcH, dstW, dstH, flags); | ||
183 | if (res < 0) { | ||
184 | dstW = 4 * w / 3; | ||
185 | dstH = 4 * h / 3; | ||
186 | flags = 33; | ||
187 | } | ||
188 | } | ||
189 | } | ||
190 | } | ||
191 | } | ||
192 | } | ||
193 | } | ||
194 | |||
195 | #define W 96 | ||
196 | #define H 96 | ||
197 | |||
198 | int main(int argc, char **argv){ | ||
199 | uint8_t *rgb_data = malloc (W*H*4); | ||
200 | uint8_t *rgb_src[3]= {rgb_data, NULL, NULL}; | ||
201 | int rgb_stride[3]={4*W, 0, 0}; | ||
202 | uint8_t *data = malloc (3*W*H); | ||
203 | uint8_t *src[3]= {data, data+W*H, data+W*H*2}; | ||
204 | int stride[3]={W, W, W}; | ||
205 | int x, y; | ||
206 | struct SwsContext *sws; | ||
207 | |||
208 | sws= sws_getContext(W/12, H/12, PIX_FMT_RGB32, W, H, PIX_FMT_YUV420P, 2, NULL, NULL, NULL); | ||
209 | |||
210 | for (y=0; y<H; y++){ | ||
211 | for (x=0; x<W*4; x++){ | ||
212 | rgb_data[ x + y*4*W]= random(); | ||
213 | } | ||
214 | } | ||
215 | #if defined(ARCH_X86) | ||
216 | sws_rgb2rgb_init(SWS_CPU_CAPS_MMX*0); | ||
217 | #else | ||
218 | sws_rgb2rgb_init(0); | ||
219 | #endif | ||
220 | sws_scale(sws, rgb_src, rgb_stride, 0, H, src, stride); | ||
221 | |||
222 | #if defined(ARCH_X86) | ||
223 | asm volatile ("emms\n\t"); | ||
224 | #endif | ||
225 | |||
226 | selfTest(src, stride, W, H); | ||
227 | free (rgb_data); | ||
228 | free (data); | ||
229 | return 123; | ||
230 | } | ||
diff --git a/src/plugins/ffmpeg/libswscale/swscale.c b/src/plugins/ffmpeg/libswscale/swscale.c deleted file mode 100644 index f6c2f76..0000000 --- a/src/plugins/ffmpeg/libswscale/swscale.c +++ /dev/null | |||
@@ -1,2934 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> | ||
3 | * | ||
4 | * This file is part of FFmpeg. | ||
5 | * | ||
6 | * FFmpeg is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * FFmpeg is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * along with FFmpeg; if not, write to the Free Software | ||
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
19 | * | ||
20 | * the C code (not assembly, mmx, ...) of this file can be used | ||
21 | * under the LGPL license too | ||
22 | */ | ||
23 | |||
24 | /* | ||
25 | supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR24, BGR16, BGR15, RGB32, RGB24, Y8/Y800, YVU9/IF09, PAL8 | ||
26 | supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09 | ||
27 | {BGR,RGB}{1,4,8,15,16} support dithering | ||
28 | |||
29 | unscaled special converters (YV12=I420=IYUV, Y800=Y8) | ||
30 | YV12 -> {BGR,RGB}{1,4,8,15,16,24,32} | ||
31 | x -> x | ||
32 | YUV9 -> YV12 | ||
33 | YUV9/YV12 -> Y800 | ||
34 | Y800 -> YUV9/YV12 | ||
35 | BGR24 -> BGR32 & RGB24 -> RGB32 | ||
36 | BGR32 -> BGR24 & RGB32 -> RGB24 | ||
37 | BGR15 -> BGR16 | ||
38 | */ | ||
39 | |||
40 | /* | ||
41 | tested special converters (most are tested actually, but I did not write it down ...) | ||
42 | YV12 -> BGR16 | ||
43 | YV12 -> YV12 | ||
44 | BGR15 -> BGR16 | ||
45 | BGR16 -> BGR16 | ||
46 | YVU9 -> YV12 | ||
47 | |||
48 | untested special converters | ||
49 | YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be ok) | ||
50 | YV12/I420 -> YV12/I420 | ||
51 | YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format | ||
52 | BGR24 -> BGR32 & RGB24 -> RGB32 | ||
53 | BGR32 -> BGR24 & RGB32 -> RGB24 | ||
54 | BGR24 -> YV12 | ||
55 | */ | ||
56 | |||
57 | #include <inttypes.h> | ||
58 | #include <string.h> | ||
59 | #include <math.h> | ||
60 | #include <stdio.h> | ||
61 | #include <unistd.h> | ||
62 | #include "config.h" | ||
63 | #include <assert.h> | ||
64 | #ifdef HAVE_SYS_MMAN_H | ||
65 | #include <sys/mman.h> | ||
66 | #if defined(MAP_ANON) && !defined(MAP_ANONYMOUS) | ||
67 | #define MAP_ANONYMOUS MAP_ANON | ||
68 | #endif | ||
69 | #endif | ||
70 | #include "swscale.h" | ||
71 | #include "swscale_internal.h" | ||
72 | #include "rgb2rgb.h" | ||
73 | #include "libavutil/x86_cpu.h" | ||
74 | #include "libavutil/bswap.h" | ||
75 | |||
76 | #undef MOVNTQ | ||
77 | #undef PAVGB | ||
78 | |||
79 | //#undef HAVE_MMX2 | ||
80 | //#define HAVE_3DNOW | ||
81 | //#undef HAVE_MMX | ||
82 | //#undef ARCH_X86 | ||
83 | //#define WORDS_BIGENDIAN | ||
84 | #define DITHER1XBPP | ||
85 | |||
86 | #define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit | ||
87 | |||
88 | #define RET 0xC3 //near return opcode for X86 | ||
89 | |||
90 | #ifdef M_PI | ||
91 | #define PI M_PI | ||
92 | #else | ||
93 | #define PI 3.14159265358979323846 | ||
94 | #endif | ||
95 | |||
96 | #define isSupportedIn(x) ( \ | ||
97 | (x)==PIX_FMT_YUV420P \ | ||
98 | || (x)==PIX_FMT_YUVA420P \ | ||
99 | || (x)==PIX_FMT_YUYV422 \ | ||
100 | || (x)==PIX_FMT_UYVY422 \ | ||
101 | || (x)==PIX_FMT_RGB32 \ | ||
102 | || (x)==PIX_FMT_BGR24 \ | ||
103 | || (x)==PIX_FMT_BGR565 \ | ||
104 | || (x)==PIX_FMT_BGR555 \ | ||
105 | || (x)==PIX_FMT_BGR32 \ | ||
106 | || (x)==PIX_FMT_RGB24 \ | ||
107 | || (x)==PIX_FMT_RGB565 \ | ||
108 | || (x)==PIX_FMT_RGB555 \ | ||
109 | || (x)==PIX_FMT_GRAY8 \ | ||
110 | || (x)==PIX_FMT_YUV410P \ | ||
111 | || (x)==PIX_FMT_GRAY16BE \ | ||
112 | || (x)==PIX_FMT_GRAY16LE \ | ||
113 | || (x)==PIX_FMT_YUV444P \ | ||
114 | || (x)==PIX_FMT_YUV422P \ | ||
115 | || (x)==PIX_FMT_YUV411P \ | ||
116 | || (x)==PIX_FMT_PAL8 \ | ||
117 | || (x)==PIX_FMT_BGR8 \ | ||
118 | || (x)==PIX_FMT_RGB8 \ | ||
119 | || (x)==PIX_FMT_BGR4_BYTE \ | ||
120 | || (x)==PIX_FMT_RGB4_BYTE \ | ||
121 | || (x)==PIX_FMT_YUV440P \ | ||
122 | ) | ||
123 | #define isSupportedOut(x) ( \ | ||
124 | (x)==PIX_FMT_YUV420P \ | ||
125 | || (x)==PIX_FMT_YUYV422 \ | ||
126 | || (x)==PIX_FMT_UYVY422 \ | ||
127 | || (x)==PIX_FMT_YUV444P \ | ||
128 | || (x)==PIX_FMT_YUV422P \ | ||
129 | || (x)==PIX_FMT_YUV411P \ | ||
130 | || isRGB(x) \ | ||
131 | || isBGR(x) \ | ||
132 | || (x)==PIX_FMT_NV12 \ | ||
133 | || (x)==PIX_FMT_NV21 \ | ||
134 | || (x)==PIX_FMT_GRAY16BE \ | ||
135 | || (x)==PIX_FMT_GRAY16LE \ | ||
136 | || (x)==PIX_FMT_GRAY8 \ | ||
137 | || (x)==PIX_FMT_YUV410P \ | ||
138 | ) | ||
139 | #define isPacked(x) ( \ | ||
140 | (x)==PIX_FMT_PAL8 \ | ||
141 | || (x)==PIX_FMT_YUYV422 \ | ||
142 | || (x)==PIX_FMT_UYVY422 \ | ||
143 | || isRGB(x) \ | ||
144 | || isBGR(x) \ | ||
145 | ) | ||
146 | |||
147 | #define RGB2YUV_SHIFT 16 | ||
148 | #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5)) | ||
149 | #define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5)) | ||
150 | #define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5)) | ||
151 | #define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5)) | ||
152 | #define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5)) | ||
153 | #define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5)) | ||
154 | #define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5)) | ||
155 | #define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5)) | ||
156 | #define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5)) | ||
157 | |||
158 | extern const int32_t Inverse_Table_6_9[8][4]; | ||
159 | |||
160 | /* | ||
161 | NOTES | ||
162 | Special versions: fast Y 1:1 scaling (no interpolation in y direction) | ||
163 | |||
164 | TODO | ||
165 | more intelligent misalignment avoidance for the horizontal scaler | ||
166 | write special vertical cubic upscale version | ||
167 | Optimize C code (yv12 / minmax) | ||
168 | add support for packed pixel yuv input & output | ||
169 | add support for Y8 output | ||
170 | optimize bgr24 & bgr32 | ||
171 | add BGR4 output support | ||
172 | write special BGR->BGR scaler | ||
173 | */ | ||
174 | |||
175 | #if defined(ARCH_X86) && defined (CONFIG_GPL) | ||
176 | DECLARE_ASM_CONST(8, uint64_t, bF8)= 0xF8F8F8F8F8F8F8F8LL; | ||
177 | DECLARE_ASM_CONST(8, uint64_t, bFC)= 0xFCFCFCFCFCFCFCFCLL; | ||
178 | DECLARE_ASM_CONST(8, uint64_t, w10)= 0x0010001000100010LL; | ||
179 | DECLARE_ASM_CONST(8, uint64_t, w02)= 0x0002000200020002LL; | ||
180 | DECLARE_ASM_CONST(8, uint64_t, bm00001111)=0x00000000FFFFFFFFLL; | ||
181 | DECLARE_ASM_CONST(8, uint64_t, bm00000111)=0x0000000000FFFFFFLL; | ||
182 | DECLARE_ASM_CONST(8, uint64_t, bm11111000)=0xFFFFFFFFFF000000LL; | ||
183 | DECLARE_ASM_CONST(8, uint64_t, bm01010101)=0x00FF00FF00FF00FFLL; | ||
184 | |||
185 | static volatile uint64_t attribute_used __attribute__((aligned(8))) b5Dither; | ||
186 | static volatile uint64_t attribute_used __attribute__((aligned(8))) g5Dither; | ||
187 | static volatile uint64_t attribute_used __attribute__((aligned(8))) g6Dither; | ||
188 | static volatile uint64_t attribute_used __attribute__((aligned(8))) r5Dither; | ||
189 | |||
190 | const DECLARE_ALIGNED(8, uint64_t, ff_dither4[2]) = { | ||
191 | 0x0103010301030103LL, | ||
192 | 0x0200020002000200LL,}; | ||
193 | |||
194 | const DECLARE_ALIGNED(8, uint64_t, ff_dither8[2]) = { | ||
195 | 0x0602060206020602LL, | ||
196 | 0x0004000400040004LL,}; | ||
197 | |||
198 | DECLARE_ASM_CONST(8, uint64_t, b16Mask)= 0x001F001F001F001FLL; | ||
199 | DECLARE_ASM_CONST(8, uint64_t, g16Mask)= 0x07E007E007E007E0LL; | ||
200 | DECLARE_ASM_CONST(8, uint64_t, r16Mask)= 0xF800F800F800F800LL; | ||
201 | DECLARE_ASM_CONST(8, uint64_t, b15Mask)= 0x001F001F001F001FLL; | ||
202 | DECLARE_ASM_CONST(8, uint64_t, g15Mask)= 0x03E003E003E003E0LL; | ||
203 | DECLARE_ASM_CONST(8, uint64_t, r15Mask)= 0x7C007C007C007C00LL; | ||
204 | |||
205 | DECLARE_ALIGNED(8, const uint64_t, ff_M24A) = 0x00FF0000FF0000FFLL; | ||
206 | DECLARE_ALIGNED(8, const uint64_t, ff_M24B) = 0xFF0000FF0000FF00LL; | ||
207 | DECLARE_ALIGNED(8, const uint64_t, ff_M24C) = 0x0000FF0000FF0000LL; | ||
208 | |||
209 | #ifdef FAST_BGR2YV12 | ||
210 | DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff) = 0x000000210041000DULL; | ||
211 | DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff) = 0x0000FFEEFFDC0038ULL; | ||
212 | DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00000038FFD2FFF8ULL; | ||
213 | #else | ||
214 | DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff) = 0x000020E540830C8BULL; | ||
215 | DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff) = 0x0000ED0FDAC23831ULL; | ||
216 | DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00003831D0E6F6EAULL; | ||
217 | #endif /* FAST_BGR2YV12 */ | ||
218 | DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset) = 0x1010101010101010ULL; | ||
219 | DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL; | ||
220 | DECLARE_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL; | ||
221 | #endif /* defined(ARCH_X86) */ | ||
222 | |||
223 | // clipping helper table for C implementations: | ||
224 | static unsigned char clip_table[768]; | ||
225 | |||
226 | static SwsVector *sws_getConvVec(SwsVector *a, SwsVector *b); | ||
227 | |||
228 | extern const uint8_t dither_2x2_4[2][8]; | ||
229 | extern const uint8_t dither_2x2_8[2][8]; | ||
230 | extern const uint8_t dither_8x8_32[8][8]; | ||
231 | extern const uint8_t dither_8x8_73[8][8]; | ||
232 | extern const uint8_t dither_8x8_220[8][8]; | ||
233 | |||
234 | const char *sws_format_name(enum PixelFormat format) | ||
235 | { | ||
236 | switch (format) { | ||
237 | case PIX_FMT_YUV420P: | ||
238 | return "yuv420p"; | ||
239 | case PIX_FMT_YUVA420P: | ||
240 | return "yuva420p"; | ||
241 | case PIX_FMT_YUYV422: | ||
242 | return "yuyv422"; | ||
243 | case PIX_FMT_RGB24: | ||
244 | return "rgb24"; | ||
245 | case PIX_FMT_BGR24: | ||
246 | return "bgr24"; | ||
247 | case PIX_FMT_YUV422P: | ||
248 | return "yuv422p"; | ||
249 | case PIX_FMT_YUV444P: | ||
250 | return "yuv444p"; | ||
251 | case PIX_FMT_RGB32: | ||
252 | return "rgb32"; | ||
253 | case PIX_FMT_YUV410P: | ||
254 | return "yuv410p"; | ||
255 | case PIX_FMT_YUV411P: | ||
256 | return "yuv411p"; | ||
257 | case PIX_FMT_RGB565: | ||
258 | return "rgb565"; | ||
259 | case PIX_FMT_RGB555: | ||
260 | return "rgb555"; | ||
261 | case PIX_FMT_GRAY16BE: | ||
262 | return "gray16be"; | ||
263 | case PIX_FMT_GRAY16LE: | ||
264 | return "gray16le"; | ||
265 | case PIX_FMT_GRAY8: | ||
266 | return "gray8"; | ||
267 | case PIX_FMT_MONOWHITE: | ||
268 | return "mono white"; | ||
269 | case PIX_FMT_MONOBLACK: | ||
270 | return "mono black"; | ||
271 | case PIX_FMT_PAL8: | ||
272 | return "Palette"; | ||
273 | case PIX_FMT_YUVJ420P: | ||
274 | return "yuvj420p"; | ||
275 | case PIX_FMT_YUVJ422P: | ||
276 | return "yuvj422p"; | ||
277 | case PIX_FMT_YUVJ444P: | ||
278 | return "yuvj444p"; | ||
279 | case PIX_FMT_XVMC_MPEG2_MC: | ||
280 | return "xvmc_mpeg2_mc"; | ||
281 | case PIX_FMT_XVMC_MPEG2_IDCT: | ||
282 | return "xvmc_mpeg2_idct"; | ||
283 | case PIX_FMT_UYVY422: | ||
284 | return "uyvy422"; | ||
285 | case PIX_FMT_UYYVYY411: | ||
286 | return "uyyvyy411"; | ||
287 | case PIX_FMT_RGB32_1: | ||
288 | return "rgb32x"; | ||
289 | case PIX_FMT_BGR32_1: | ||
290 | return "bgr32x"; | ||
291 | case PIX_FMT_BGR32: | ||
292 | return "bgr32"; | ||
293 | case PIX_FMT_BGR565: | ||
294 | return "bgr565"; | ||
295 | case PIX_FMT_BGR555: | ||
296 | return "bgr555"; | ||
297 | case PIX_FMT_BGR8: | ||
298 | return "bgr8"; | ||
299 | case PIX_FMT_BGR4: | ||
300 | return "bgr4"; | ||
301 | case PIX_FMT_BGR4_BYTE: | ||
302 | return "bgr4 byte"; | ||
303 | case PIX_FMT_RGB8: | ||
304 | return "rgb8"; | ||
305 | case PIX_FMT_RGB4: | ||
306 | return "rgb4"; | ||
307 | case PIX_FMT_RGB4_BYTE: | ||
308 | return "rgb4 byte"; | ||
309 | case PIX_FMT_NV12: | ||
310 | return "nv12"; | ||
311 | case PIX_FMT_NV21: | ||
312 | return "nv21"; | ||
313 | case PIX_FMT_YUV440P: | ||
314 | return "yuv440p"; | ||
315 | default: | ||
316 | return "Unknown format"; | ||
317 | } | ||
318 | } | ||
319 | |||
320 | static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, | ||
321 | int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | ||
322 | uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW) | ||
323 | { | ||
324 | //FIXME Optimize (just quickly writen not opti..) | ||
325 | int i; | ||
326 | for (i=0; i<dstW; i++) | ||
327 | { | ||
328 | int val=1<<18; | ||
329 | int j; | ||
330 | for (j=0; j<lumFilterSize; j++) | ||
331 | val += lumSrc[j][i] * lumFilter[j]; | ||
332 | |||
333 | dest[i]= av_clip_uint8(val>>19); | ||
334 | } | ||
335 | |||
336 | if (uDest) | ||
337 | for (i=0; i<chrDstW; i++) | ||
338 | { | ||
339 | int u=1<<18; | ||
340 | int v=1<<18; | ||
341 | int j; | ||
342 | for (j=0; j<chrFilterSize; j++) | ||
343 | { | ||
344 | u += chrSrc[j][i] * chrFilter[j]; | ||
345 | v += chrSrc[j][i + VOFW] * chrFilter[j]; | ||
346 | } | ||
347 | |||
348 | uDest[i]= av_clip_uint8(u>>19); | ||
349 | vDest[i]= av_clip_uint8(v>>19); | ||
350 | } | ||
351 | } | ||
352 | |||
353 | static inline void yuv2nv12XinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, | ||
354 | int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | ||
355 | uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat) | ||
356 | { | ||
357 | //FIXME Optimize (just quickly writen not opti..) | ||
358 | int i; | ||
359 | for (i=0; i<dstW; i++) | ||
360 | { | ||
361 | int val=1<<18; | ||
362 | int j; | ||
363 | for (j=0; j<lumFilterSize; j++) | ||
364 | val += lumSrc[j][i] * lumFilter[j]; | ||
365 | |||
366 | dest[i]= av_clip_uint8(val>>19); | ||
367 | } | ||
368 | |||
369 | if (!uDest) | ||
370 | return; | ||
371 | |||
372 | if (dstFormat == PIX_FMT_NV12) | ||
373 | for (i=0; i<chrDstW; i++) | ||
374 | { | ||
375 | int u=1<<18; | ||
376 | int v=1<<18; | ||
377 | int j; | ||
378 | for (j=0; j<chrFilterSize; j++) | ||
379 | { | ||
380 | u += chrSrc[j][i] * chrFilter[j]; | ||
381 | v += chrSrc[j][i + VOFW] * chrFilter[j]; | ||
382 | } | ||
383 | |||
384 | uDest[2*i]= av_clip_uint8(u>>19); | ||
385 | uDest[2*i+1]= av_clip_uint8(v>>19); | ||
386 | } | ||
387 | else | ||
388 | for (i=0; i<chrDstW; i++) | ||
389 | { | ||
390 | int u=1<<18; | ||
391 | int v=1<<18; | ||
392 | int j; | ||
393 | for (j=0; j<chrFilterSize; j++) | ||
394 | { | ||
395 | u += chrSrc[j][i] * chrFilter[j]; | ||
396 | v += chrSrc[j][i + VOFW] * chrFilter[j]; | ||
397 | } | ||
398 | |||
399 | uDest[2*i]= av_clip_uint8(v>>19); | ||
400 | uDest[2*i+1]= av_clip_uint8(u>>19); | ||
401 | } | ||
402 | } | ||
403 | |||
404 | #define YSCALE_YUV_2_PACKEDX_C(type) \ | ||
405 | for (i=0; i<(dstW>>1); i++){\ | ||
406 | int j;\ | ||
407 | int Y1 = 1<<18;\ | ||
408 | int Y2 = 1<<18;\ | ||
409 | int U = 1<<18;\ | ||
410 | int V = 1<<18;\ | ||
411 | type av_unused *r, *b, *g;\ | ||
412 | const int i2= 2*i;\ | ||
413 | \ | ||
414 | for (j=0; j<lumFilterSize; j++)\ | ||
415 | {\ | ||
416 | Y1 += lumSrc[j][i2] * lumFilter[j];\ | ||
417 | Y2 += lumSrc[j][i2+1] * lumFilter[j];\ | ||
418 | }\ | ||
419 | for (j=0; j<chrFilterSize; j++)\ | ||
420 | {\ | ||
421 | U += chrSrc[j][i] * chrFilter[j];\ | ||
422 | V += chrSrc[j][i+VOFW] * chrFilter[j];\ | ||
423 | }\ | ||
424 | Y1>>=19;\ | ||
425 | Y2>>=19;\ | ||
426 | U >>=19;\ | ||
427 | V >>=19;\ | ||
428 | if ((Y1|Y2|U|V)&256)\ | ||
429 | {\ | ||
430 | if (Y1>255) Y1=255; \ | ||
431 | else if (Y1<0)Y1=0; \ | ||
432 | if (Y2>255) Y2=255; \ | ||
433 | else if (Y2<0)Y2=0; \ | ||
434 | if (U>255) U=255; \ | ||
435 | else if (U<0) U=0; \ | ||
436 | if (V>255) V=255; \ | ||
437 | else if (V<0) V=0; \ | ||
438 | } | ||
439 | |||
440 | #define YSCALE_YUV_2_RGBX_C(type) \ | ||
441 | YSCALE_YUV_2_PACKEDX_C(type) \ | ||
442 | r = (type *)c->table_rV[V]; \ | ||
443 | g = (type *)(c->table_gU[U] + c->table_gV[V]); \ | ||
444 | b = (type *)c->table_bU[U]; \ | ||
445 | |||
446 | #define YSCALE_YUV_2_PACKED2_C \ | ||
447 | for (i=0; i<(dstW>>1); i++){ \ | ||
448 | const int i2= 2*i; \ | ||
449 | int Y1= (buf0[i2 ]*yalpha1+buf1[i2 ]*yalpha)>>19; \ | ||
450 | int Y2= (buf0[i2+1]*yalpha1+buf1[i2+1]*yalpha)>>19; \ | ||
451 | int U= (uvbuf0[i ]*uvalpha1+uvbuf1[i ]*uvalpha)>>19; \ | ||
452 | int V= (uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19; \ | ||
453 | |||
454 | #define YSCALE_YUV_2_RGB2_C(type) \ | ||
455 | YSCALE_YUV_2_PACKED2_C\ | ||
456 | type *r, *b, *g;\ | ||
457 | r = (type *)c->table_rV[V];\ | ||
458 | g = (type *)(c->table_gU[U] + c->table_gV[V]);\ | ||
459 | b = (type *)c->table_bU[U];\ | ||
460 | |||
461 | #define YSCALE_YUV_2_PACKED1_C \ | ||
462 | for (i=0; i<(dstW>>1); i++){\ | ||
463 | const int i2= 2*i;\ | ||
464 | int Y1= buf0[i2 ]>>7;\ | ||
465 | int Y2= buf0[i2+1]>>7;\ | ||
466 | int U= (uvbuf1[i ])>>7;\ | ||
467 | int V= (uvbuf1[i+VOFW])>>7;\ | ||
468 | |||
469 | #define YSCALE_YUV_2_RGB1_C(type) \ | ||
470 | YSCALE_YUV_2_PACKED1_C\ | ||
471 | type *r, *b, *g;\ | ||
472 | r = (type *)c->table_rV[V];\ | ||
473 | g = (type *)(c->table_gU[U] + c->table_gV[V]);\ | ||
474 | b = (type *)c->table_bU[U];\ | ||
475 | |||
476 | #define YSCALE_YUV_2_PACKED1B_C \ | ||
477 | for (i=0; i<(dstW>>1); i++){\ | ||
478 | const int i2= 2*i;\ | ||
479 | int Y1= buf0[i2 ]>>7;\ | ||
480 | int Y2= buf0[i2+1]>>7;\ | ||
481 | int U= (uvbuf0[i ] + uvbuf1[i ])>>8;\ | ||
482 | int V= (uvbuf0[i+VOFW] + uvbuf1[i+VOFW])>>8;\ | ||
483 | |||
484 | #define YSCALE_YUV_2_RGB1B_C(type) \ | ||
485 | YSCALE_YUV_2_PACKED1B_C\ | ||
486 | type *r, *b, *g;\ | ||
487 | r = (type *)c->table_rV[V];\ | ||
488 | g = (type *)(c->table_gU[U] + c->table_gV[V]);\ | ||
489 | b = (type *)c->table_bU[U];\ | ||
490 | |||
491 | #define YSCALE_YUV_2_ANYRGB_C(func, func2)\ | ||
492 | switch(c->dstFormat)\ | ||
493 | {\ | ||
494 | case PIX_FMT_RGB32:\ | ||
495 | case PIX_FMT_BGR32:\ | ||
496 | func(uint32_t)\ | ||
497 | ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];\ | ||
498 | ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];\ | ||
499 | } \ | ||
500 | break;\ | ||
501 | case PIX_FMT_RGB24:\ | ||
502 | func(uint8_t)\ | ||
503 | ((uint8_t*)dest)[0]= r[Y1];\ | ||
504 | ((uint8_t*)dest)[1]= g[Y1];\ | ||
505 | ((uint8_t*)dest)[2]= b[Y1];\ | ||
506 | ((uint8_t*)dest)[3]= r[Y2];\ | ||
507 | ((uint8_t*)dest)[4]= g[Y2];\ | ||
508 | ((uint8_t*)dest)[5]= b[Y2];\ | ||
509 | dest+=6;\ | ||
510 | }\ | ||
511 | break;\ | ||
512 | case PIX_FMT_BGR24:\ | ||
513 | func(uint8_t)\ | ||
514 | ((uint8_t*)dest)[0]= b[Y1];\ | ||
515 | ((uint8_t*)dest)[1]= g[Y1];\ | ||
516 | ((uint8_t*)dest)[2]= r[Y1];\ | ||
517 | ((uint8_t*)dest)[3]= b[Y2];\ | ||
518 | ((uint8_t*)dest)[4]= g[Y2];\ | ||
519 | ((uint8_t*)dest)[5]= r[Y2];\ | ||
520 | dest+=6;\ | ||
521 | }\ | ||
522 | break;\ | ||
523 | case PIX_FMT_RGB565:\ | ||
524 | case PIX_FMT_BGR565:\ | ||
525 | {\ | ||
526 | const int dr1= dither_2x2_8[y&1 ][0];\ | ||
527 | const int dg1= dither_2x2_4[y&1 ][0];\ | ||
528 | const int db1= dither_2x2_8[(y&1)^1][0];\ | ||
529 | const int dr2= dither_2x2_8[y&1 ][1];\ | ||
530 | const int dg2= dither_2x2_4[y&1 ][1];\ | ||
531 | const int db2= dither_2x2_8[(y&1)^1][1];\ | ||
532 | func(uint16_t)\ | ||
533 | ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\ | ||
534 | ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\ | ||
535 | }\ | ||
536 | }\ | ||
537 | break;\ | ||
538 | case PIX_FMT_RGB555:\ | ||
539 | case PIX_FMT_BGR555:\ | ||
540 | {\ | ||
541 | const int dr1= dither_2x2_8[y&1 ][0];\ | ||
542 | const int dg1= dither_2x2_8[y&1 ][1];\ | ||
543 | const int db1= dither_2x2_8[(y&1)^1][0];\ | ||
544 | const int dr2= dither_2x2_8[y&1 ][1];\ | ||
545 | const int dg2= dither_2x2_8[y&1 ][0];\ | ||
546 | const int db2= dither_2x2_8[(y&1)^1][1];\ | ||
547 | func(uint16_t)\ | ||
548 | ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\ | ||
549 | ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\ | ||
550 | }\ | ||
551 | }\ | ||
552 | break;\ | ||
553 | case PIX_FMT_RGB8:\ | ||
554 | case PIX_FMT_BGR8:\ | ||
555 | {\ | ||
556 | const uint8_t * const d64= dither_8x8_73[y&7];\ | ||
557 | const uint8_t * const d32= dither_8x8_32[y&7];\ | ||
558 | func(uint8_t)\ | ||
559 | ((uint8_t*)dest)[i2+0]= r[Y1+d32[(i2+0)&7]] + g[Y1+d32[(i2+0)&7]] + b[Y1+d64[(i2+0)&7]];\ | ||
560 | ((uint8_t*)dest)[i2+1]= r[Y2+d32[(i2+1)&7]] + g[Y2+d32[(i2+1)&7]] + b[Y2+d64[(i2+1)&7]];\ | ||
561 | }\ | ||
562 | }\ | ||
563 | break;\ | ||
564 | case PIX_FMT_RGB4:\ | ||
565 | case PIX_FMT_BGR4:\ | ||
566 | {\ | ||
567 | const uint8_t * const d64= dither_8x8_73 [y&7];\ | ||
568 | const uint8_t * const d128=dither_8x8_220[y&7];\ | ||
569 | func(uint8_t)\ | ||
570 | ((uint8_t*)dest)[i]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]]\ | ||
571 | + ((r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]])<<4);\ | ||
572 | }\ | ||
573 | }\ | ||
574 | break;\ | ||
575 | case PIX_FMT_RGB4_BYTE:\ | ||
576 | case PIX_FMT_BGR4_BYTE:\ | ||
577 | {\ | ||
578 | const uint8_t * const d64= dither_8x8_73 [y&7];\ | ||
579 | const uint8_t * const d128=dither_8x8_220[y&7];\ | ||
580 | func(uint8_t)\ | ||
581 | ((uint8_t*)dest)[i2+0]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]];\ | ||
582 | ((uint8_t*)dest)[i2+1]= r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]];\ | ||
583 | }\ | ||
584 | }\ | ||
585 | break;\ | ||
586 | case PIX_FMT_MONOBLACK:\ | ||
587 | {\ | ||
588 | const uint8_t * const d128=dither_8x8_220[y&7];\ | ||
589 | uint8_t *g= c->table_gU[128] + c->table_gV[128];\ | ||
590 | for (i=0; i<dstW-7; i+=8){\ | ||
591 | int acc;\ | ||
592 | acc = g[((buf0[i ]*yalpha1+buf1[i ]*yalpha)>>19) + d128[0]];\ | ||
593 | acc+= acc + g[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19) + d128[1]];\ | ||
594 | acc+= acc + g[((buf0[i+2]*yalpha1+buf1[i+2]*yalpha)>>19) + d128[2]];\ | ||
595 | acc+= acc + g[((buf0[i+3]*yalpha1+buf1[i+3]*yalpha)>>19) + d128[3]];\ | ||
596 | acc+= acc + g[((buf0[i+4]*yalpha1+buf1[i+4]*yalpha)>>19) + d128[4]];\ | ||
597 | acc+= acc + g[((buf0[i+5]*yalpha1+buf1[i+5]*yalpha)>>19) + d128[5]];\ | ||
598 | acc+= acc + g[((buf0[i+6]*yalpha1+buf1[i+6]*yalpha)>>19) + d128[6]];\ | ||
599 | acc+= acc + g[((buf0[i+7]*yalpha1+buf1[i+7]*yalpha)>>19) + d128[7]];\ | ||
600 | ((uint8_t*)dest)[0]= acc;\ | ||
601 | dest++;\ | ||
602 | }\ | ||
603 | \ | ||
604 | /*\ | ||
605 | ((uint8_t*)dest)-= dstW>>4;\ | ||
606 | {\ | ||
607 | int acc=0;\ | ||
608 | int left=0;\ | ||
609 | static int top[1024];\ | ||
610 | static int last_new[1024][1024];\ | ||
611 | static int last_in3[1024][1024];\ | ||
612 | static int drift[1024][1024];\ | ||
613 | int topLeft=0;\ | ||
614 | int shift=0;\ | ||
615 | int count=0;\ | ||
616 | const uint8_t * const d128=dither_8x8_220[y&7];\ | ||
617 | int error_new=0;\ | ||
618 | int error_in3=0;\ | ||
619 | int f=0;\ | ||
620 | \ | ||
621 | for (i=dstW>>1; i<dstW; i++){\ | ||
622 | int in= ((buf0[i ]*yalpha1+buf1[i ]*yalpha)>>19);\ | ||
623 | int in2 = (76309 * (in - 16) + 32768) >> 16;\ | ||
624 | int in3 = (in2 < 0) ? 0 : ((in2 > 255) ? 255 : in2);\ | ||
625 | int old= (left*7 + topLeft + top[i]*5 + top[i+1]*3)/20 + in3\ | ||
626 | + (last_new[y][i] - in3)*f/256;\ | ||
627 | int new= old> 128 ? 255 : 0;\ | ||
628 | \ | ||
629 | error_new+= FFABS(last_new[y][i] - new);\ | ||
630 | error_in3+= FFABS(last_in3[y][i] - in3);\ | ||
631 | f= error_new - error_in3*4;\ | ||
632 | if (f<0) f=0;\ | ||
633 | if (f>256) f=256;\ | ||
634 | \ | ||
635 | topLeft= top[i];\ | ||
636 | left= top[i]= old - new;\ | ||
637 | last_new[y][i]= new;\ | ||
638 | last_in3[y][i]= in3;\ | ||
639 | \ | ||
640 | acc+= acc + (new&1);\ | ||
641 | if ((i&7)==6){\ | ||
642 | ((uint8_t*)dest)[0]= acc;\ | ||
643 | ((uint8_t*)dest)++;\ | ||
644 | }\ | ||
645 | }\ | ||
646 | }\ | ||
647 | */\ | ||
648 | }\ | ||
649 | break;\ | ||
650 | case PIX_FMT_YUYV422:\ | ||
651 | func2\ | ||
652 | ((uint8_t*)dest)[2*i2+0]= Y1;\ | ||
653 | ((uint8_t*)dest)[2*i2+1]= U;\ | ||
654 | ((uint8_t*)dest)[2*i2+2]= Y2;\ | ||
655 | ((uint8_t*)dest)[2*i2+3]= V;\ | ||
656 | } \ | ||
657 | break;\ | ||
658 | case PIX_FMT_UYVY422:\ | ||
659 | func2\ | ||
660 | ((uint8_t*)dest)[2*i2+0]= U;\ | ||
661 | ((uint8_t*)dest)[2*i2+1]= Y1;\ | ||
662 | ((uint8_t*)dest)[2*i2+2]= V;\ | ||
663 | ((uint8_t*)dest)[2*i2+3]= Y2;\ | ||
664 | } \ | ||
665 | break;\ | ||
666 | }\ | ||
667 | |||
668 | |||
/**
 * C fallback: vertically scale/filter YUV planes and emit one line of a
 * packed destination format.
 *
 * The YSCALE_YUV_2_RGBX_C / YSCALE_YUV_2_PACKEDX_C macros (defined earlier
 * in this file) expand to the per-pixel-pair loop header, compute Y1/Y2 and
 * the chroma-derived r/g/b lookup tables (resp. U/V), and open braces that
 * each case body closes with the bare '}' before its "break;".
 *
 * @param c             scaler context (dstFormat, RGB lookup tables)
 * @param lumFilter     vertical luma filter coefficients (lumFilterSize taps)
 * @param lumSrc        pointers to the luma source lines being filtered
 * @param chrFilter     vertical chroma filter coefficients (chrFilterSize taps)
 * @param chrSrc        pointers to the chroma source lines
 * @param dest          output line (packed pixels)
 * @param dstW          destination width in pixels
 * @param y             output line number, used to pick the dither row
 */
static inline void yuv2packedXinC(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                  int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                  uint8_t *dest, int dstW, int y)
{
    int i;
    switch(c->dstFormat)
    {
    case PIX_FMT_BGR32:
    case PIX_FMT_RGB32:
        /* 32bpp: r/g/b are pre-shifted component tables, summed into one word */
        YSCALE_YUV_2_RGBX_C(uint32_t)
            ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];
            ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];
        }
        break;
    case PIX_FMT_RGB24:
        /* 24bpp: write 3 bytes per pixel and advance dest manually */
        YSCALE_YUV_2_RGBX_C(uint8_t)
            ((uint8_t*)dest)[0]= r[Y1];
            ((uint8_t*)dest)[1]= g[Y1];
            ((uint8_t*)dest)[2]= b[Y1];
            ((uint8_t*)dest)[3]= r[Y2];
            ((uint8_t*)dest)[4]= g[Y2];
            ((uint8_t*)dest)[5]= b[Y2];
            dest+=6;
        }
        break;
    case PIX_FMT_BGR24:
        /* same as RGB24 with the r/b byte order swapped */
        YSCALE_YUV_2_RGBX_C(uint8_t)
            ((uint8_t*)dest)[0]= b[Y1];
            ((uint8_t*)dest)[1]= g[Y1];
            ((uint8_t*)dest)[2]= r[Y1];
            ((uint8_t*)dest)[3]= b[Y2];
            ((uint8_t*)dest)[4]= g[Y2];
            ((uint8_t*)dest)[5]= r[Y2];
            dest+=6;
        }
        break;
    case PIX_FMT_RGB565:
    case PIX_FMT_BGR565:
        {
            /* 2x2 ordered dither; green gets the finer table because it has
               6 bits (vs 5 for r/b) in 565 */
            const int dr1= dither_2x2_8[y&1    ][0];
            const int dg1= dither_2x2_4[y&1    ][0];
            const int db1= dither_2x2_8[(y&1)^1][0];
            const int dr2= dither_2x2_8[y&1    ][1];
            const int dg2= dither_2x2_4[y&1    ][1];
            const int db2= dither_2x2_8[(y&1)^1][1];
            YSCALE_YUV_2_RGBX_C(uint16_t)
                ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];
                ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];
            }
        }
        break;
    case PIX_FMT_RGB555:
    case PIX_FMT_BGR555:
        {
            /* 555: all components are 5 bit, so the same 2x2_8 table is used
               for all three, just phase-shifted per component/pixel */
            const int dr1= dither_2x2_8[y&1    ][0];
            const int dg1= dither_2x2_8[y&1    ][1];
            const int db1= dither_2x2_8[(y&1)^1][0];
            const int dr2= dither_2x2_8[y&1    ][1];
            const int dg2= dither_2x2_8[y&1    ][0];
            const int db2= dither_2x2_8[(y&1)^1][1];
            YSCALE_YUV_2_RGBX_C(uint16_t)
                ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];
                ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];
            }
        }
        break;
    case PIX_FMT_RGB8:
    case PIX_FMT_BGR8:
        {
            /* 8bpp (3-3-2 style): 8x8 ordered dither indexed by output column */
            const uint8_t * const d64= dither_8x8_73[y&7];
            const uint8_t * const d32= dither_8x8_32[y&7];
            YSCALE_YUV_2_RGBX_C(uint8_t)
                ((uint8_t*)dest)[i2+0]= r[Y1+d32[(i2+0)&7]] + g[Y1+d32[(i2+0)&7]] + b[Y1+d64[(i2+0)&7]];
                ((uint8_t*)dest)[i2+1]= r[Y2+d32[(i2+1)&7]] + g[Y2+d32[(i2+1)&7]] + b[Y2+d64[(i2+1)&7]];
            }
        }
        break;
    case PIX_FMT_RGB4:
    case PIX_FMT_BGR4:
        {
            /* 4bpp: two pixels packed per output byte (second pixel in the
               high nibble), hence the single [i] store per pair */
            const uint8_t * const d64= dither_8x8_73 [y&7];
            const uint8_t * const d128=dither_8x8_220[y&7];
            YSCALE_YUV_2_RGBX_C(uint8_t)
                ((uint8_t*)dest)[i]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]]
                                   +((r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]])<<4);
            }
        }
        break;
    case PIX_FMT_RGB4_BYTE:
    case PIX_FMT_BGR4_BYTE:
        {
            /* 4 bits of color stored one pixel per byte */
            const uint8_t * const d64= dither_8x8_73 [y&7];
            const uint8_t * const d128=dither_8x8_220[y&7];
            YSCALE_YUV_2_RGBX_C(uint8_t)
                ((uint8_t*)dest)[i2+0]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]];
                ((uint8_t*)dest)[i2+1]= r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]];
            }
        }
        break;
    case PIX_FMT_MONOBLACK:
        {
            /* 1bpp: luma-only path, filter applied inline (no macro).
               Bits are accumulated MSB-first into 'acc' and flushed every
               8 pixels; g[] acts as the threshold/dither lookup here. */
            const uint8_t * const d128=dither_8x8_220[y&7];
            uint8_t *g= c->table_gU[128] + c->table_gV[128];
            int acc=0;
            for (i=0; i<dstW-1; i+=2){
                int j;
                int Y1=1<<18;   /* rounding bias for the >>19 below */
                int Y2=1<<18;

                for (j=0; j<lumFilterSize; j++)
                {
                    Y1 += lumSrc[j][i] * lumFilter[j];
                    Y2 += lumSrc[j][i+1] * lumFilter[j];
                }
                Y1>>=19;
                Y2>>=19;
                /* cheap out-of-range test before the exact clip */
                if ((Y1|Y2)&256)
                {
                    if (Y1>255)   Y1=255;
                    else if (Y1<0)Y1=0;
                    if (Y2>255)   Y2=255;
                    else if (Y2<0)Y2=0;
                }
                acc+= acc + g[Y1+d128[(i+0)&7]];
                acc+= acc + g[Y2+d128[(i+1)&7]];
                if ((i&7)==6){
                    ((uint8_t*)dest)[0]= acc;
                    dest++;
                }
            }
        }
        break;
    case PIX_FMT_YUYV422:
        /* packed YUV: no RGB tables needed, hence the PACKEDX variant */
        YSCALE_YUV_2_PACKEDX_C(void)
            ((uint8_t*)dest)[2*i2+0]= Y1;
            ((uint8_t*)dest)[2*i2+1]= U;
            ((uint8_t*)dest)[2*i2+2]= Y2;
            ((uint8_t*)dest)[2*i2+3]= V;
        }
        break;
    case PIX_FMT_UYVY422:
        YSCALE_YUV_2_PACKEDX_C(void)
            ((uint8_t*)dest)[2*i2+0]= U;
            ((uint8_t*)dest)[2*i2+1]= Y1;
            ((uint8_t*)dest)[2*i2+2]= V;
            ((uint8_t*)dest)[2*i2+3]= Y2;
        }
        break;
    }
}
819 | |||
820 | |||
821 | //Note: we have C, X86, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one | ||
822 | //Plain C versions | ||
823 | #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT) || !defined(CONFIG_GPL) | ||
824 | #define COMPILE_C | ||
825 | #endif | ||
826 | |||
827 | #ifdef ARCH_POWERPC | ||
828 | #if (defined (HAVE_ALTIVEC) || defined (RUNTIME_CPUDETECT)) && defined (CONFIG_GPL) | ||
829 | #define COMPILE_ALTIVEC | ||
830 | #endif //HAVE_ALTIVEC | ||
831 | #endif //ARCH_POWERPC | ||
832 | |||
833 | #if defined(ARCH_X86) | ||
834 | |||
835 | #if ((defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)) && defined (CONFIG_GPL) | ||
836 | #define COMPILE_MMX | ||
837 | #endif | ||
838 | |||
839 | #if (defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)) && defined (CONFIG_GPL) | ||
840 | #define COMPILE_MMX2 | ||
841 | #endif | ||
842 | |||
843 | #if ((defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)) && defined (CONFIG_GPL) | ||
844 | #define COMPILE_3DNOW | ||
845 | #endif | ||
846 | #endif //ARCH_X86 || ARCH_X86_64 | ||
847 | |||
848 | #undef HAVE_MMX | ||
849 | #undef HAVE_MMX2 | ||
850 | #undef HAVE_3DNOW | ||
851 | |||
852 | #ifdef COMPILE_C | ||
853 | #undef HAVE_MMX | ||
854 | #undef HAVE_MMX2 | ||
855 | #undef HAVE_3DNOW | ||
856 | #undef HAVE_ALTIVEC | ||
857 | #define RENAME(a) a ## _C | ||
858 | #include "swscale_template.c" | ||
859 | #endif | ||
860 | |||
861 | #ifdef COMPILE_ALTIVEC | ||
862 | #undef RENAME | ||
863 | #define HAVE_ALTIVEC | ||
864 | #define RENAME(a) a ## _altivec | ||
865 | #include "swscale_template.c" | ||
866 | #endif | ||
867 | |||
868 | #if defined(ARCH_X86) | ||
869 | |||
870 | //X86 versions | ||
871 | /* | ||
872 | #undef RENAME | ||
873 | #undef HAVE_MMX | ||
874 | #undef HAVE_MMX2 | ||
875 | #undef HAVE_3DNOW | ||
876 | #define ARCH_X86 | ||
877 | #define RENAME(a) a ## _X86 | ||
878 | #include "swscale_template.c" | ||
879 | */ | ||
880 | //MMX versions | ||
881 | #ifdef COMPILE_MMX | ||
882 | #undef RENAME | ||
883 | #define HAVE_MMX | ||
884 | #undef HAVE_MMX2 | ||
885 | #undef HAVE_3DNOW | ||
886 | #define RENAME(a) a ## _MMX | ||
887 | #include "swscale_template.c" | ||
888 | #endif | ||
889 | |||
890 | //MMX2 versions | ||
891 | #ifdef COMPILE_MMX2 | ||
892 | #undef RENAME | ||
893 | #define HAVE_MMX | ||
894 | #define HAVE_MMX2 | ||
895 | #undef HAVE_3DNOW | ||
896 | #define RENAME(a) a ## _MMX2 | ||
897 | #include "swscale_template.c" | ||
898 | #endif | ||
899 | |||
900 | //3DNOW versions | ||
901 | #ifdef COMPILE_3DNOW | ||
902 | #undef RENAME | ||
903 | #define HAVE_MMX | ||
904 | #undef HAVE_MMX2 | ||
905 | #define HAVE_3DNOW | ||
906 | #define RENAME(a) a ## _3DNow | ||
907 | #include "swscale_template.c" | ||
908 | #endif | ||
909 | |||
910 | #endif //ARCH_X86 || ARCH_X86_64 | ||
911 | |||
912 | // minor note: the HAVE_xyz is messed up after that line so don't use it | ||
913 | |||
/*
 * Evaluate the cubic-spline basis polynomial a + b*t + c*t^2 + d*t^3 at
 * distance 'dist'. For dist > 1 the polynomial piece is repeatedly shifted
 * by one unit (coefficient substitution below) until dist lands in [0,1],
 * then evaluated with Horner's scheme. Iterative form of the original
 * tail recursion; identical operation order, hence identical results.
 */
static double getSplineCoeff(double a, double b, double c, double d, double dist)
{
    while (dist > 1.0) {
        /* coefficients of the same spline piece, re-anchored one unit right */
        double shiftedB = b + 2.0*c + 3.0*d;
        double shiftedC = c + 3.0*d;
        double shiftedD = -b - 3.0*c - 6.0*d;

        a = 0.0;
        b = shiftedB;
        c = shiftedC;
        d = shiftedD;
        dist -= 1.0;
    }
    return ((d*dist + c)*dist + b)*dist + a;
}
924 | |||
/**
 * Build the integer scaling filter (coefficients + source positions) for one
 * direction of the scaler.
 *
 * Pipeline: (1) build a floating-point filter for the selected algorithm
 * (unscaled / point / bilinear-upscale / generic windowed kernels),
 * (2) convolve it with the optional srcFilter into filter2,
 * (3) trim near-zero taps on both sides, (4) round the size up to
 * filterAlign, (5) clamp filter positions to the source borders by folding
 * coefficients inward, (6) normalize so each row sums to 'one' and store as
 * int16_t with error diffusion.
 *
 * @param outFilter      *outFilter receives the int16 coefficients,
 *                       (dstW+1) rows of *outFilterSize taps (caller frees)
 * @param filterPos      *filterPos receives the per-output-pixel source
 *                       start positions (dstW+1 entries; extra entry for the
 *                       over-reading MMX scaler)
 * @param outFilterSize  receives the final (aligned) tap count
 * @param xInc           16.16 fixed-point source step per destination pixel
 * @param filterAlign    required tap-count alignment (may be lowered for
 *                       small AltiVec/MMX filters below)
 * @param one            target sum of each coefficient row (fixed-point "1.0")
 * @param flags          SWS_* algorithm and CPU-caps flags
 * @param param          2 tunables, SWS_PARAM_DEFAULT selects per-kernel defaults
 * @return 0 on success, -1 on allocation failure / oversized filter
 */
static inline int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
                             int srcW, int dstW, int filterAlign, int one, int flags,
                             SwsVector *srcFilter, SwsVector *dstFilter, double param[2])
{
    int i;
    int filterSize;
    int filter2Size;
    int minFilterSize;
    double *filter=NULL;
    double *filter2=NULL;
    int ret= -1;
#if defined(ARCH_X86)
    if (flags & SWS_CPU_CAPS_MMX)
        asm volatile("emms\n\t"::: "memory"); //FIXME this should not be required but it IS (even for non-MMX versions)
#endif

    // Note the +1 is for the MMXscaler which reads over the end
    *filterPos = av_malloc((dstW+1)*sizeof(int16_t));

    if (FFABS(xInc - 0x10000) <10) // unscaled
    {
        /* 1:1 mapping: identity filter, position i -> i */
        int i;
        filterSize= 1;
        filter= av_malloc(dstW*sizeof(double)*filterSize);
        for (i=0; i<dstW*filterSize; i++) filter[i]=0;

        for (i=0; i<dstW; i++)
        {
            filter[i*filterSize]=1;
            (*filterPos)[i]=i;
        }

    }
    else if (flags&SWS_POINT) // lame looking point sampling mode
    {
        /* nearest neighbour: single unit tap at the rounded source position */
        int i;
        int xDstInSrc;
        filterSize= 1;
        filter= av_malloc(dstW*sizeof(double)*filterSize);

        xDstInSrc= xInc/2 - 0x8000;   /* center of the first destination pixel in 16.16 */
        for (i=0; i<dstW; i++)
        {
            int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;

            (*filterPos)[i]= xx;
            filter[i]= 1.0;
            xDstInSrc+= xInc;
        }
    }
    else if ((xInc <= (1<<16) && (flags&SWS_AREA)) || (flags&SWS_FAST_BILINEAR)) // bilinear upscale
    {
        /* linear interpolation computed in fixed point (AREA degenerates to
           bilinear when upscaling) */
        int i;
        int xDstInSrc;
        if (flags&SWS_BICUBIC) filterSize= 4;
        else if (flags&SWS_X  ) filterSize= 4;
        else                   filterSize= 2; // SWS_BILINEAR / SWS_AREA
        filter= av_malloc(dstW*sizeof(double)*filterSize);

        xDstInSrc= xInc/2 - 0x8000;
        for (i=0; i<dstW; i++)
        {
            int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
            int j;

            (*filterPos)[i]= xx;
            //Bilinear upscale / linear interpolate / Area averaging
            for (j=0; j<filterSize; j++)
            {
                double d= FFABS((xx<<16) - xDstInSrc)/(double)(1<<16);
                double coeff= 1.0 - d;
                if (coeff<0) coeff=0;
                filter[i*filterSize + j]= coeff;
                xx++;
            }
            xDstInSrc+= xInc;
        }
    }
    else
    {
        /* generic path: evaluate the selected kernel at each tap position */
        double xDstInSrc;
        double sizeFactor, filterSizeInSrc;
        const double xInc1= (double)xInc / (double)(1<<16);

        /* kernel support width in source pixels (at scale 1) */
        if      (flags&SWS_BICUBIC)      sizeFactor=  4.0;
        else if (flags&SWS_X)            sizeFactor=  8.0;
        else if (flags&SWS_AREA)         sizeFactor=  1.0; //downscale only, for upscale it is bilinear
        else if (flags&SWS_GAUSS)        sizeFactor=  8.0;   // infinite ;)
        else if (flags&SWS_LANCZOS)      sizeFactor= param[0] != SWS_PARAM_DEFAULT ? 2.0*param[0] : 6.0;
        else if (flags&SWS_SINC)         sizeFactor= 20.0; // infinite ;)
        else if (flags&SWS_SPLINE)       sizeFactor= 20.0;  // infinite ;)
        else if (flags&SWS_BILINEAR)     sizeFactor=  2.0;
        else {
            sizeFactor= 0.0; //GCC warning killer
            assert(0);
        }

        if (xInc1 <= 1.0)       filterSizeInSrc= sizeFactor; // upscale
        else                    filterSizeInSrc= sizeFactor*srcW / (double)dstW;

        filterSize= (int)ceil(1 + filterSizeInSrc); // will be reduced later if possible
        if (filterSize > srcW-2) filterSize=srcW-2;

        filter= av_malloc(dstW*sizeof(double)*filterSize);

        xDstInSrc= xInc1 / 2.0 - 0.5;
        for (i=0; i<dstW; i++)
        {
            int xx= (int)(xDstInSrc - (filterSize-1)*0.5 + 0.5);
            int j;
            (*filterPos)[i]= xx;
            for (j=0; j<filterSize; j++)
            {
                /* d: tap distance from the pixel center, rescaled so the
                   kernels below can be written for scale 1 */
                double d= FFABS(xx - xDstInSrc)/filterSizeInSrc*sizeFactor;
                double coeff;
                if (flags & SWS_BICUBIC)
                {
                    /* Mitchell-Netravali style B/C parametrization */
                    double B= param[0] != SWS_PARAM_DEFAULT ? param[0] : 0.0;
                    double C= param[1] != SWS_PARAM_DEFAULT ? param[1] : 0.6;

                    if (d<1.0)
                        coeff = (12-9*B-6*C)*d*d*d + (-18+12*B+6*C)*d*d + 6-2*B;
                    else if (d<2.0)
                        coeff = (-B-6*C)*d*d*d + (6*B+30*C)*d*d + (-12*B-48*C)*d +8*B+24*C;
                    else
                        coeff=0.0;
                }
/*                else if (flags & SWS_X)
                {
                    double p= param ? param*0.01 : 0.3;
                    coeff = d ? sin(d*PI)/(d*PI) : 1.0;
                    coeff*= pow(2.0, - p*d*d);
                }*/
                else if (flags & SWS_X)
                {
                    /* experimental cosine-based kernel, sharpness via A */
                    double A= param[0] != SWS_PARAM_DEFAULT ? param[0] : 1.0;

                    if (d<1.0)
                        coeff = cos(d*PI);
                    else
                        coeff=-1.0;
                    if (coeff<0.0) coeff= -pow(-coeff, A);
                    else           coeff=  pow( coeff, A);
                    coeff= coeff*0.5 + 0.5;
                }
                else if (flags & SWS_AREA)
                {
                    /* box filter: coefficient = overlap of source pixel with
                       destination pixel footprint */
                    double srcPixelSize= 1.0/xInc1;
                    if      (d + srcPixelSize/2 < 0.5) coeff= 1.0;
                    else if (d - srcPixelSize/2 < 0.5) coeff= (0.5-d)/srcPixelSize + 0.5;
                    else coeff=0.0;
                }
                else if (flags & SWS_GAUSS)
                {
                    double p= param[0] != SWS_PARAM_DEFAULT ? param[0] : 3.0;
                    coeff = pow(2.0, - p*d*d);
                }
                else if (flags & SWS_SINC)
                {
                    coeff = d ? sin(d*PI)/(d*PI) : 1.0;
                }
                else if (flags & SWS_LANCZOS)
                {
                    /* windowed sinc with p lobes */
                    double p= param[0] != SWS_PARAM_DEFAULT ? param[0] : 3.0;
                    coeff = d ? sin(d*PI)*sin(d*PI/p)/(d*d*PI*PI/p) : 1.0;
                    if (d>p) coeff=0;
                }
                else if (flags & SWS_BILINEAR)
                {
                    coeff= 1.0 - d;
                    if (coeff<0) coeff=0;
                }
                else if (flags & SWS_SPLINE)
                {
                    double p=-2.196152422706632;
                    coeff = getSplineCoeff(1.0, 0.0, p, -p-1.0, d);
                }
                else {
                    coeff= 0.0; //GCC warning killer
                    assert(0);
                }

                filter[i*filterSize + j]= coeff;
                xx++;
            }
            xDstInSrc+= xInc1;
        }
    }

    /* apply src & dst Filter to filter -> filter2
       av_free(filter);
    */
    assert(filterSize>0);
    filter2Size= filterSize;
    if (srcFilter) filter2Size+= srcFilter->length - 1;
    if (dstFilter) filter2Size+= dstFilter->length - 1;
    assert(filter2Size>0);
    filter2= av_malloc(filter2Size*dstW*sizeof(double));

    for (i=0; i<dstW; i++)
    {
        int j;
        SwsVector scaleFilter;
        SwsVector *outVec;

        /* view row i of 'filter' as an SwsVector so it can be convolved */
        scaleFilter.coeff= filter + i*filterSize;
        scaleFilter.length= filterSize;

        if (srcFilter) outVec= sws_getConvVec(srcFilter, &scaleFilter);
        else           outVec= &scaleFilter;

        assert(outVec->length == filter2Size);
        //FIXME dstFilter

        for (j=0; j<outVec->length; j++)
        {
            filter2[i*filter2Size + j]= outVec->coeff[j];
        }

        /* keep the filter centered after the size change */
        (*filterPos)[i]+= (filterSize-1)/2 - (filter2Size-1)/2;

        if (outVec != &scaleFilter) sws_freeVec(outVec);
    }
    av_freep(&filter);

    /* try to reduce the filter-size (step1 find size and shift left) */
    // Assume it is near normalized (*0.5 or *2.0 is OK but * 0.001 is not).
    minFilterSize= 0;
    for (i=dstW-1; i>=0; i--)
    {
        int min= filter2Size;
        int j;
        double cutOff=0.0;

        /* get rid off near zero elements on the left by shifting left */
        for (j=0; j<filter2Size; j++)
        {
            int k;
            /* element 0 is read each pass: after every shift the next
               candidate for removal sits at index 0 again */
            cutOff += FFABS(filter2[i*filter2Size]);

            if (cutOff > SWS_MAX_REDUCE_CUTOFF) break;

            /* preserve monotonicity because the core can't handle the filter otherwise */
            if (i<dstW-1 && (*filterPos)[i] >= (*filterPos)[i+1]) break;

            // Move filter coeffs left
            for (k=1; k<filter2Size; k++)
                filter2[i*filter2Size + k - 1]= filter2[i*filter2Size + k];
            filter2[i*filter2Size + k - 1]= 0.0;
            (*filterPos)[i]++;
        }

        cutOff=0.0;
        /* count near zeros on the right */
        for (j=filter2Size-1; j>0; j--)
        {
            cutOff += FFABS(filter2[i*filter2Size + j]);

            if (cutOff > SWS_MAX_REDUCE_CUTOFF) break;
            min--;
        }

        if (min>minFilterSize) minFilterSize= min;
    }

    if (flags & SWS_CPU_CAPS_ALTIVEC) {
        // we can handle the special case 4,
        // so we don't want to go to the full 8
        if (minFilterSize < 5)
            filterAlign = 4;

        // we really don't want to waste our time
        // doing useless computation, so fall-back on
        // the scalar C code for very small filter.
        // vectorizing is worth it only if you have
        // decent-sized vector.
        if (minFilterSize < 3)
            filterAlign = 1;
    }

    if (flags & SWS_CPU_CAPS_MMX) {
        // special case for unscaled vertical filtering
        if (minFilterSize == 1 && filterAlign == 2)
            filterAlign= 1;
    }

    assert(minFilterSize > 0);
    /* round the tap count up to the required alignment */
    filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
    assert(filterSize > 0);
    filter= av_malloc(filterSize*dstW*sizeof(double));
    if (filterSize >= MAX_FILTER_SIZE || !filter)
        goto error;
    *outFilterSize= filterSize;

    if (flags&SWS_PRINT_INFO)
        av_log(NULL, AV_LOG_VERBOSE, "SwScaler: reducing / aligning filtersize %d -> %d\n", filter2Size, filterSize);
    /* try to reduce the filter-size (step2 reduce it) */
    for (i=0; i<dstW; i++)
    {
        int j;

        for (j=0; j<filterSize; j++)
        {
            if (j>=filter2Size) filter[i*filterSize + j]= 0.0;
            else                filter[i*filterSize + j]= filter2[i*filter2Size + j];
        }
    }


    //FIXME try to align filterpos if possible

    //fix borders
    for (i=0; i<dstW; i++)
    {
        int j;
        if ((*filterPos)[i] < 0)
        {
            // Move filter coeffs left to compensate for filterPos
            for (j=1; j<filterSize; j++)
            {
                int left= FFMAX(j + (*filterPos)[i], 0);
                filter[i*filterSize + left] += filter[i*filterSize + j];
                filter[i*filterSize + j]=0;
            }
            (*filterPos)[i]= 0;
        }

        if ((*filterPos)[i] + filterSize > srcW)
        {
            int shift= (*filterPos)[i] + filterSize - srcW;
            // Move filter coeffs right to compensate for filterPos
            for (j=filterSize-2; j>=0; j--)
            {
                int right= FFMIN(j + shift, filterSize-1);
                filter[i*filterSize +right] += filter[i*filterSize +j];
                filter[i*filterSize +j]=0;
            }
            (*filterPos)[i]= srcW - filterSize;
        }
    }

    // Note the +1 is for the MMXscaler which reads over the end
    /* align at 16 for AltiVec (needed by hScale_altivec_real) */
    *outFilter= av_mallocz(*outFilterSize*(dstW+1)*sizeof(int16_t));

    /* Normalize & Store in outFilter */
    for (i=0; i<dstW; i++)
    {
        int j;
        double error=0;     /* error diffusion so each int row still sums to 'one' */
        double sum=0;
        double scale= one;

        for (j=0; j<filterSize; j++)
        {
            sum+= filter[i*filterSize + j];
        }
        scale/= sum;
        for (j=0; j<*outFilterSize; j++)
        {
            double v= filter[i*filterSize + j]*scale + error;
            int intV= floor(v + 0.5);
            (*outFilter)[i*(*outFilterSize) + j]= intV;
            error = v - intV;
        }
    }

    (*filterPos)[dstW]= (*filterPos)[dstW-1]; // the MMX scaler will read over the end
    /* duplicate the last coefficient row for the same over-read reason */
    for (i=0; i<*outFilterSize; i++)
    {
        int j= dstW*(*outFilterSize);
        (*outFilter)[j + i]= (*outFilter)[j + i - (*outFilterSize)];
    }

    ret=0;
error:
    av_free(filter);
    av_free(filter2);
    return ret;
}
1305 | |||
1306 | #ifdef COMPILE_MMX2 | ||
/**
 * Generate a specialized MMX2 horizontal scaling routine at runtime.
 *
 * Two template code fragments are embedded in the inline-asm blocks below;
 * the asm is jumped over at run time ("jmp 9f") and only used to obtain, via
 * the label arithmetic after "9:", each fragment's start address, length,
 * and the byte offsets of the two pshufw immediates inside it.  The loop
 * then copies fragments into 'funnyCode' (one per 4-pixel group), patching
 * the pshufw immediates so each copy gathers exactly the source bytes that
 * group needs.  Fragment B handles groups whose 4 source samples span <4
 * input bytes (needs the extra movd/punpcklbw pair); fragment A the general
 * case.  The generated buffer is terminated per group with RET.
 *
 * @param dstW       destination width in pixels
 * @param xInc       16.16 fixed-point horizontal source step
 * @param funnyCode  output buffer receiving the generated machine code
 * @param filter     output: 4 coefficients per group (7-bit, from the
 *                   fractional position)
 * @param filterPos  output: source byte position per 4-pixel group
 * @param numSplits  how many interleaved passes share the work
 */
static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *filter, int32_t *filterPos, int numSplits)
{
    uint8_t *fragmentA;
    long imm8OfPShufW1A;
    long imm8OfPShufW2A;
    long fragmentLengthA;
    uint8_t *fragmentB;
    long imm8OfPShufW1B;
    long imm8OfPShufW2B;
    long fragmentLengthB;
    int fragmentPos;

    int xpos, i;

    // create an optimized horizontal scaling routine

    //code fragment

    asm volatile(
        "jmp 9f \n\t"
    // Begin
        "0: \n\t"
        "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t"
        "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t"
        "movd 1(%%"REG_c", %%"REG_S"), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "pshufw $0xFF, %%mm1, %%mm1 \n\t"
        "1: \n\t"
        "pshufw $0xFF, %%mm0, %%mm0 \n\t"
        "2: \n\t"
        "psubw %%mm1, %%mm0 \n\t"
        "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t"
        "pmullw %%mm3, %%mm0 \n\t"
        "psllw $7, %%mm1 \n\t"
        "paddw %%mm1, %%mm0 \n\t"

        "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t"

        "add $8, %%"REG_a" \n\t"
    // End
        "9: \n\t"
        // "int $3 \n\t"
        /* compute fragment address, pshufw-immediate offsets (labels 1/2
           point just after each pshufw; the 'dec' steps back onto its imm8
           byte) and total fragment length */
        "lea " LOCAL_MANGLE(0b) ", %0 \n\t"
        "lea " LOCAL_MANGLE(1b) ", %1 \n\t"
        "lea " LOCAL_MANGLE(2b) ", %2 \n\t"
        "dec %1 \n\t"
        "dec %2 \n\t"
        "sub %0, %1 \n\t"
        "sub %0, %2 \n\t"
        "lea " LOCAL_MANGLE(9b) ", %3 \n\t"
        "sub %0, %3 \n\t"


        :"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
        "=r" (fragmentLengthA)
    );

    /* fragment B: variant for groups packed into fewer source bytes */
    asm volatile(
        "jmp 9f \n\t"
    // Begin
        "0: \n\t"
        "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t"
        "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "pshufw $0xFF, %%mm0, %%mm1 \n\t"
        "1: \n\t"
        "pshufw $0xFF, %%mm0, %%mm0 \n\t"
        "2: \n\t"
        "psubw %%mm1, %%mm0 \n\t"
        "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t"
        "pmullw %%mm3, %%mm0 \n\t"
        "psllw $7, %%mm1 \n\t"
        "paddw %%mm1, %%mm0 \n\t"

        "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t"

        "add $8, %%"REG_a" \n\t"
    // End
        "9: \n\t"
        // "int $3 \n\t"
        "lea " LOCAL_MANGLE(0b) ", %0 \n\t"
        "lea " LOCAL_MANGLE(1b) ", %1 \n\t"
        "lea " LOCAL_MANGLE(2b) ", %2 \n\t"
        "dec %1 \n\t"
        "dec %2 \n\t"
        "sub %0, %1 \n\t"
        "sub %0, %2 \n\t"
        "lea " LOCAL_MANGLE(9b) ", %3 \n\t"
        "sub %0, %3 \n\t"


        :"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
        "=r" (fragmentLengthB)
    );

    xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
    fragmentPos=0;

    for (i=0; i<dstW/numSplits; i++)
    {
        int xx=xpos>>16;

        if ((i&3) == 0)
        {
            /* a..d: byte offsets (relative to xx) of the 4 source samples
               of this group; used to build the pshufw shuffle masks */
            int a=0;
            int b=((xpos+xInc)>>16) - xx;
            int c=((xpos+xInc*2)>>16) - xx;
            int d=((xpos+xInc*3)>>16) - xx;

            /* 7-bit interpolation weights from the fractional positions */
            filter[i  ] = ((  xpos         & 0xFFFF) ^ 0xFFFF)>>9;
            filter[i+1] = (((xpos+xInc  )  & 0xFFFF) ^ 0xFFFF)>>9;
            filter[i+2] = (((xpos+xInc*2)  & 0xFFFF) ^ 0xFFFF)>>9;
            filter[i+3] = (((xpos+xInc*3)  & 0xFFFF) ^ 0xFFFF)>>9;
            filterPos[i/2]= xx;

            if (d+1<4)
            {
                /* whole group fits in <4 source bytes -> use fragment B */
                int maxShift= 3-(d+1);
                int shift=0;

                memcpy(funnyCode + fragmentPos, fragmentB, fragmentLengthB);

                funnyCode[fragmentPos + imm8OfPShufW1B]=
                    (a+1) | ((b+1)<<2) | ((c+1)<<4) | ((d+1)<<6);
                funnyCode[fragmentPos + imm8OfPShufW2B]=
                    a | (b<<2) | (c<<4) | (d<<6);

                if (i+3>=dstW) shift=maxShift; //avoid overread
                else if ((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //Align

                if (shift && i>=shift)
                {
                    /* 0x55 adds 'shift' to all four 2-bit shuffle fields */
                    funnyCode[fragmentPos + imm8OfPShufW1B]+= 0x55*shift;
                    funnyCode[fragmentPos + imm8OfPShufW2B]+= 0x55*shift;
                    filterPos[i/2]-=shift;
                }

                fragmentPos+= fragmentLengthB;
            }
            else
            {
                int maxShift= 3-d;
                int shift=0;

                memcpy(funnyCode + fragmentPos, fragmentA, fragmentLengthA);

                funnyCode[fragmentPos + imm8OfPShufW1A]=
                funnyCode[fragmentPos + imm8OfPShufW2A]=
                    a | (b<<2) | (c<<4) | (d<<6);

                if (i+4>=dstW) shift=maxShift; //avoid overread
                else if ((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //partial align

                if (shift && i>=shift)
                {
                    funnyCode[fragmentPos + imm8OfPShufW1A]+= 0x55*shift;
                    funnyCode[fragmentPos + imm8OfPShufW2A]+= 0x55*shift;
                    filterPos[i/2]-=shift;
                }

                fragmentPos+= fragmentLengthA;
            }

            /* terminate the code generated so far */
            funnyCode[fragmentPos]= RET;
        }
        xpos+=xInc;
    }
    filterPos[i/2]= xpos>>16; // needed to jump to the next part
}
1477 | #endif /* COMPILE_MMX2 */ | ||
1478 | |||
1479 | static void globalInit(void){ | ||
1480 | // generating tables: | ||
1481 | int i; | ||
1482 | for (i=0; i<768; i++){ | ||
1483 | int c= av_clip_uint8(i-256); | ||
1484 | clip_table[i]=c; | ||
1485 | } | ||
1486 | } | ||
1487 | |||
/**
 * Pick the swScale implementation for the given CPU-capability flags.
 * With RUNTIME_CPUDETECT (GPL builds) the choice is made from the SWS_CPU_CAPS_*
 * bits in 'flags'; otherwise it is fixed at compile time by the HAVE_* macros.
 * (Note: HAVE_MMX etc. were redefined by the template includes above, so the
 * compile-time branch reflects the last template compiled.)
 */
static SwsFunc getSwsFunc(int flags){

#if defined(RUNTIME_CPUDETECT) && defined (CONFIG_GPL)
#if defined(ARCH_X86)
    // ordered per speed fastest first
    if (flags & SWS_CPU_CAPS_MMX2)
        return swScale_MMX2;
    else if (flags & SWS_CPU_CAPS_3DNOW)
        return swScale_3DNow;
    else if (flags & SWS_CPU_CAPS_MMX)
        return swScale_MMX;
    else
        return swScale_C;

#else
#ifdef ARCH_POWERPC
    if (flags & SWS_CPU_CAPS_ALTIVEC)
        return swScale_altivec;
    else
        return swScale_C;
#endif
    /* non-x86, non-PPC: only the plain C implementation exists */
    return swScale_C;
#endif /* defined(ARCH_X86) */
#else //RUNTIME_CPUDETECT
#ifdef HAVE_MMX2
    return swScale_MMX2;
#elif defined (HAVE_3DNOW)
    return swScale_3DNow;
#elif defined (HAVE_MMX)
    return swScale_MMX;
#elif defined (HAVE_ALTIVEC)
    return swScale_altivec;
#else
    return swScale_C;
#endif
#endif //!RUNTIME_CPUDETECT
}
1525 | |||
1526 | static int PlanarToNV12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | ||
1527 | int srcSliceH, uint8_t* dstParam[], int dstStride[]){ | ||
1528 | uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY; | ||
1529 | /* Copy Y plane */ | ||
1530 | if (dstStride[0]==srcStride[0] && srcStride[0] > 0) | ||
1531 | memcpy(dst, src[0], srcSliceH*dstStride[0]); | ||
1532 | else | ||
1533 | { | ||
1534 | int i; | ||
1535 | uint8_t *srcPtr= src[0]; | ||
1536 | uint8_t *dstPtr= dst; | ||
1537 | for (i=0; i<srcSliceH; i++) | ||
1538 | { | ||
1539 | memcpy(dstPtr, srcPtr, c->srcW); | ||
1540 | srcPtr+= srcStride[0]; | ||
1541 | dstPtr+= dstStride[0]; | ||
1542 | } | ||
1543 | } | ||
1544 | dst = dstParam[1] + dstStride[1]*srcSliceY/2; | ||
1545 | if (c->dstFormat == PIX_FMT_NV12) | ||
1546 | interleaveBytes(src[1], src[2], dst, c->srcW/2, srcSliceH/2, srcStride[1], srcStride[2], dstStride[0]); | ||
1547 | else | ||
1548 | interleaveBytes(src[2], src[1], dst, c->srcW/2, srcSliceH/2, srcStride[2], srcStride[1], dstStride[0]); | ||
1549 | |||
1550 | return srcSliceH; | ||
1551 | } | ||
1552 | |||
1553 | static int PlanarToYuy2Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | ||
1554 | int srcSliceH, uint8_t* dstParam[], int dstStride[]){ | ||
1555 | uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY; | ||
1556 | |||
1557 | yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0], srcStride[1], dstStride[0]); | ||
1558 | |||
1559 | return srcSliceH; | ||
1560 | } | ||
1561 | |||
1562 | static int PlanarToUyvyWrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | ||
1563 | int srcSliceH, uint8_t* dstParam[], int dstStride[]){ | ||
1564 | uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY; | ||
1565 | |||
1566 | yv12touyvy(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0], srcStride[1], dstStride[0]); | ||
1567 | |||
1568 | return srcSliceH; | ||
1569 | } | ||
1570 | |||
/* {RGB,BGR}{15,16,24,32} -> {RGB,BGR}{15,16,24,32} */
/* Unscaled packed-RGB repacking wrapper: selects one of the rgb2rgb.c
 * converters based on the (srcId, dstId) depth pair, then runs it either on
 * the whole slice at once (compatible strides) or row by row. */
static int rgb2rgbWrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                          int srcSliceH, uint8_t* dst[], int dstStride[]){
    const int srcFormat= c->srcFormat;
    const int dstFormat= c->dstFormat;
    const int srcBpp= (fmt_depth(srcFormat) + 7) >> 3;
    const int dstBpp= (fmt_depth(dstFormat) + 7) >> 3;
    /* depth>>2 gives a compact per-depth id, combined below into a single
     * switch key: low nibble = source id, high nibble = destination id */
    const int srcId= fmt_depth(srcFormat) >> 2; /* 1:0, 4:1, 8:2, 15:3, 16:4, 24:6, 32:8 */
    const int dstId= fmt_depth(dstFormat) >> 2;
    void (*conv)(const uint8_t *src, uint8_t *dst, long src_size)=NULL;

    /* BGR -> BGR */
    /* Same channel order on both sides: only the depth changes. */
    if ( (isBGR(srcFormat) && isBGR(dstFormat))
      || (isRGB(srcFormat) && isRGB(dstFormat))){
        switch(srcId | (dstId<<4)){
        case 0x34: conv= rgb16to15; break;
        case 0x36: conv= rgb24to15; break;
        case 0x38: conv= rgb32to15; break;
        case 0x43: conv= rgb15to16; break;
        case 0x46: conv= rgb24to16; break;
        case 0x48: conv= rgb32to16; break;
        case 0x63: conv= rgb15to24; break;
        case 0x64: conv= rgb16to24; break;
        case 0x68: conv= rgb32to24; break;
        case 0x83: conv= rgb15to32; break;
        case 0x84: conv= rgb16to32; break;
        case 0x86: conv= rgb24to32; break;
        default: av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
                        sws_format_name(srcFormat), sws_format_name(dstFormat)); break;
        }
    /* Opposite channel order: depth change plus R<->B swap. */
    }else if ( (isBGR(srcFormat) && isRGB(dstFormat))
            || (isRGB(srcFormat) && isBGR(dstFormat))){
        switch(srcId | (dstId<<4)){
        case 0x33: conv= rgb15tobgr15; break;
        case 0x34: conv= rgb16tobgr15; break;
        case 0x36: conv= rgb24tobgr15; break;
        case 0x38: conv= rgb32tobgr15; break;
        case 0x43: conv= rgb15tobgr16; break;
        case 0x44: conv= rgb16tobgr16; break;
        case 0x46: conv= rgb24tobgr16; break;
        case 0x48: conv= rgb32tobgr16; break;
        case 0x63: conv= rgb15tobgr24; break;
        case 0x64: conv= rgb16tobgr24; break;
        case 0x66: conv= rgb24tobgr24; break;
        case 0x68: conv= rgb32tobgr24; break;
        case 0x83: conv= rgb15tobgr32; break;
        case 0x84: conv= rgb16tobgr32; break;
        case 0x86: conv= rgb24tobgr32; break;
        case 0x88: conv= rgb32tobgr32; break;
        default: av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
                        sws_format_name(srcFormat), sws_format_name(dstFormat)); break;
        }
    }else{
        av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
               sws_format_name(srcFormat), sws_format_name(dstFormat));
    }

    /* An unsupported pair leaves conv NULL: the error is logged above and the
     * slice is returned untouched. */
    if(conv)
    {
        /* Fast path: the pixel runs line up in both buffers, convert the
         * whole slice in one call. */
        if (dstStride[0]*srcBpp == srcStride[0]*dstBpp && srcStride[0] > 0)
            conv(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
        else
        {
            int i;
            uint8_t *srcPtr= src[0];
            uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;

            for (i=0; i<srcSliceH; i++)
            {
                conv(srcPtr, dstPtr, c->srcW*srcBpp);
                srcPtr+= srcStride[0];
                dstPtr+= dstStride[0];
            }
        }
    }
    return srcSliceH;
}
1648 | |||
1649 | static int bgr24toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | ||
1650 | int srcSliceH, uint8_t* dst[], int dstStride[]){ | ||
1651 | |||
1652 | rgb24toyv12( | ||
1653 | src[0], | ||
1654 | dst[0]+ srcSliceY *dstStride[0], | ||
1655 | dst[1]+(srcSliceY>>1)*dstStride[1], | ||
1656 | dst[2]+(srcSliceY>>1)*dstStride[2], | ||
1657 | c->srcW, srcSliceH, | ||
1658 | dstStride[0], dstStride[1], srcStride[0]); | ||
1659 | return srcSliceH; | ||
1660 | } | ||
1661 | |||
1662 | static int yvu9toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | ||
1663 | int srcSliceH, uint8_t* dst[], int dstStride[]){ | ||
1664 | int i; | ||
1665 | |||
1666 | /* copy Y */ | ||
1667 | if (srcStride[0]==dstStride[0] && srcStride[0] > 0) | ||
1668 | memcpy(dst[0]+ srcSliceY*dstStride[0], src[0], srcStride[0]*srcSliceH); | ||
1669 | else{ | ||
1670 | uint8_t *srcPtr= src[0]; | ||
1671 | uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY; | ||
1672 | |||
1673 | for (i=0; i<srcSliceH; i++) | ||
1674 | { | ||
1675 | memcpy(dstPtr, srcPtr, c->srcW); | ||
1676 | srcPtr+= srcStride[0]; | ||
1677 | dstPtr+= dstStride[0]; | ||
1678 | } | ||
1679 | } | ||
1680 | |||
1681 | if (c->dstFormat==PIX_FMT_YUV420P){ | ||
1682 | planar2x(src[1], dst[1], c->chrSrcW, c->chrSrcH, srcStride[1], dstStride[1]); | ||
1683 | planar2x(src[2], dst[2], c->chrSrcW, c->chrSrcH, srcStride[2], dstStride[2]); | ||
1684 | }else{ | ||
1685 | planar2x(src[1], dst[2], c->chrSrcW, c->chrSrcH, srcStride[1], dstStride[2]); | ||
1686 | planar2x(src[2], dst[1], c->chrSrcW, c->chrSrcH, srcStride[2], dstStride[1]); | ||
1687 | } | ||
1688 | return srcSliceH; | ||
1689 | } | ||
1690 | |||
1691 | /* unscaled copy like stuff (assumes nearly identical formats) */ | ||
1692 | static int packedCopy(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | ||
1693 | int srcSliceH, uint8_t* dst[], int dstStride[]) | ||
1694 | { | ||
1695 | if (dstStride[0]==srcStride[0] && srcStride[0] > 0) | ||
1696 | memcpy(dst[0] + dstStride[0]*srcSliceY, src[0], srcSliceH*dstStride[0]); | ||
1697 | else | ||
1698 | { | ||
1699 | int i; | ||
1700 | uint8_t *srcPtr= src[0]; | ||
1701 | uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY; | ||
1702 | int length=0; | ||
1703 | |||
1704 | /* universal length finder */ | ||
1705 | while(length+c->srcW <= FFABS(dstStride[0]) | ||
1706 | && length+c->srcW <= FFABS(srcStride[0])) length+= c->srcW; | ||
1707 | assert(length!=0); | ||
1708 | |||
1709 | for (i=0; i<srcSliceH; i++) | ||
1710 | { | ||
1711 | memcpy(dstPtr, srcPtr, length); | ||
1712 | srcPtr+= srcStride[0]; | ||
1713 | dstPtr+= dstStride[0]; | ||
1714 | } | ||
1715 | } | ||
1716 | return srcSliceH; | ||
1717 | } | ||
1718 | |||
1719 | static int planarCopy(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | ||
1720 | int srcSliceH, uint8_t* dst[], int dstStride[]) | ||
1721 | { | ||
1722 | int plane; | ||
1723 | for (plane=0; plane<3; plane++) | ||
1724 | { | ||
1725 | int length= plane==0 ? c->srcW : -((-c->srcW )>>c->chrDstHSubSample); | ||
1726 | int y= plane==0 ? srcSliceY: -((-srcSliceY)>>c->chrDstVSubSample); | ||
1727 | int height= plane==0 ? srcSliceH: -((-srcSliceH)>>c->chrDstVSubSample); | ||
1728 | |||
1729 | if ((isGray(c->srcFormat) || isGray(c->dstFormat)) && plane>0) | ||
1730 | { | ||
1731 | if (!isGray(c->dstFormat)) | ||
1732 | memset(dst[plane], 128, dstStride[plane]*height); | ||
1733 | } | ||
1734 | else | ||
1735 | { | ||
1736 | if (dstStride[plane]==srcStride[plane] && srcStride[plane] > 0) | ||
1737 | memcpy(dst[plane] + dstStride[plane]*y, src[plane], height*dstStride[plane]); | ||
1738 | else | ||
1739 | { | ||
1740 | int i; | ||
1741 | uint8_t *srcPtr= src[plane]; | ||
1742 | uint8_t *dstPtr= dst[plane] + dstStride[plane]*y; | ||
1743 | for (i=0; i<height; i++) | ||
1744 | { | ||
1745 | memcpy(dstPtr, srcPtr, length); | ||
1746 | srcPtr+= srcStride[plane]; | ||
1747 | dstPtr+= dstStride[plane]; | ||
1748 | } | ||
1749 | } | ||
1750 | } | ||
1751 | } | ||
1752 | return srcSliceH; | ||
1753 | } | ||
1754 | |||
1755 | static int gray16togray(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | ||
1756 | int srcSliceH, uint8_t* dst[], int dstStride[]){ | ||
1757 | |||
1758 | int length= c->srcW; | ||
1759 | int y= srcSliceY; | ||
1760 | int height= srcSliceH; | ||
1761 | int i, j; | ||
1762 | uint8_t *srcPtr= src[0]; | ||
1763 | uint8_t *dstPtr= dst[0] + dstStride[0]*y; | ||
1764 | |||
1765 | if (!isGray(c->dstFormat)){ | ||
1766 | int height= -((-srcSliceH)>>c->chrDstVSubSample); | ||
1767 | memset(dst[1], 128, dstStride[1]*height); | ||
1768 | memset(dst[2], 128, dstStride[2]*height); | ||
1769 | } | ||
1770 | if (c->srcFormat == PIX_FMT_GRAY16LE) srcPtr++; | ||
1771 | for (i=0; i<height; i++) | ||
1772 | { | ||
1773 | for (j=0; j<length; j++) dstPtr[j] = srcPtr[j<<1]; | ||
1774 | srcPtr+= srcStride[0]; | ||
1775 | dstPtr+= dstStride[0]; | ||
1776 | } | ||
1777 | return srcSliceH; | ||
1778 | } | ||
1779 | |||
1780 | static int graytogray16(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | ||
1781 | int srcSliceH, uint8_t* dst[], int dstStride[]){ | ||
1782 | |||
1783 | int length= c->srcW; | ||
1784 | int y= srcSliceY; | ||
1785 | int height= srcSliceH; | ||
1786 | int i, j; | ||
1787 | uint8_t *srcPtr= src[0]; | ||
1788 | uint8_t *dstPtr= dst[0] + dstStride[0]*y; | ||
1789 | for (i=0; i<height; i++) | ||
1790 | { | ||
1791 | for (j=0; j<length; j++) | ||
1792 | { | ||
1793 | dstPtr[j<<1] = srcPtr[j]; | ||
1794 | dstPtr[(j<<1)+1] = srcPtr[j]; | ||
1795 | } | ||
1796 | srcPtr+= srcStride[0]; | ||
1797 | dstPtr+= dstStride[0]; | ||
1798 | } | ||
1799 | return srcSliceH; | ||
1800 | } | ||
1801 | |||
1802 | static int gray16swap(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | ||
1803 | int srcSliceH, uint8_t* dst[], int dstStride[]){ | ||
1804 | |||
1805 | int length= c->srcW; | ||
1806 | int y= srcSliceY; | ||
1807 | int height= srcSliceH; | ||
1808 | int i, j; | ||
1809 | uint16_t *srcPtr= (uint16_t*)src[0]; | ||
1810 | uint16_t *dstPtr= (uint16_t*)(dst[0] + dstStride[0]*y/2); | ||
1811 | for (i=0; i<height; i++) | ||
1812 | { | ||
1813 | for (j=0; j<length; j++) dstPtr[j] = bswap_16(srcPtr[j]); | ||
1814 | srcPtr+= srcStride[0]/2; | ||
1815 | dstPtr+= dstStride[0]/2; | ||
1816 | } | ||
1817 | return srcSliceH; | ||
1818 | } | ||
1819 | |||
1820 | |||
1821 | static void getSubSampleFactors(int *h, int *v, int format){ | ||
1822 | switch(format){ | ||
1823 | case PIX_FMT_UYVY422: | ||
1824 | case PIX_FMT_YUYV422: | ||
1825 | *h=1; | ||
1826 | *v=0; | ||
1827 | break; | ||
1828 | case PIX_FMT_YUV420P: | ||
1829 | case PIX_FMT_YUVA420P: | ||
1830 | case PIX_FMT_GRAY16BE: | ||
1831 | case PIX_FMT_GRAY16LE: | ||
1832 | case PIX_FMT_GRAY8: //FIXME remove after different subsamplings are fully implemented | ||
1833 | case PIX_FMT_NV12: | ||
1834 | case PIX_FMT_NV21: | ||
1835 | *h=1; | ||
1836 | *v=1; | ||
1837 | break; | ||
1838 | case PIX_FMT_YUV440P: | ||
1839 | *h=0; | ||
1840 | *v=1; | ||
1841 | break; | ||
1842 | case PIX_FMT_YUV410P: | ||
1843 | *h=2; | ||
1844 | *v=2; | ||
1845 | break; | ||
1846 | case PIX_FMT_YUV444P: | ||
1847 | *h=0; | ||
1848 | *v=0; | ||
1849 | break; | ||
1850 | case PIX_FMT_YUV422P: | ||
1851 | *h=1; | ||
1852 | *v=0; | ||
1853 | break; | ||
1854 | case PIX_FMT_YUV411P: | ||
1855 | *h=2; | ||
1856 | *v=0; | ||
1857 | break; | ||
1858 | default: | ||
1859 | *h=0; | ||
1860 | *v=0; | ||
1861 | break; | ||
1862 | } | ||
1863 | } | ||
1864 | |||
/* Rounds a 16.16 fixed-point value to the nearest integer, saturating to the
 * signed 16-bit range; the result is returned as a uint16_t bit pattern
 * (so -1 comes back as 0xFFFF, and underflow saturates to 0x8000). */
static uint16_t roundToInt16(int64_t f){
    int r = (int)((f + (1<<15)) >> 16);

    if (r < -0x7FFF)
        return 0x8000;
    if (r > 0x7FFF)
        return 0x7FFF;
    return r;
}
1871 | |||
/**
 * Initializes the colorspace-conversion coefficients used by the YUV->RGB
 * path, applying range expansion, contrast, saturation and brightness.
 *
 * @param inv_table  the yuv2rgb coeffs, normally Inverse_Table_6_9[x]
 * @param srcRange   if 1 the source luma range is 0..255, if 0 it is 16..235
 * @param table      the destination-side coeffs (stored, not otherwise used here)
 * @param dstRange   if 1 the destination luma range is 0..255, if 0 it is 16..235
 * @param brightness luma offset, 0 == neutral
 * @param contrast   luma gain in 16.16 fixed point, 1<<16 == neutral
 * @param saturation chroma gain in 16.16 fixed point, 1<<16 == neutral
 * @return -1 if not supported (i.e. when the destination is YUV or gray)
 */
int sws_setColorspaceDetails(SwsContext *c, const int inv_table[4], int srcRange, const int table[4], int dstRange, int brightness, int contrast, int saturation){
    int64_t crv = inv_table[0];
    int64_t cbu = inv_table[1];
    int64_t cgu = -inv_table[2];
    int64_t cgv = -inv_table[3];
    int64_t cy = 1<<16;  /* unity luma gain in 16.16 */
    int64_t oy = 0;      /* luma offset in 16.16 */

    /* Only meaningful when converting towards RGB. */
    if (isYUV(c->dstFormat) || isGray(c->dstFormat)) return -1;
    memcpy(c->srcColorspaceTable, inv_table, sizeof(int)*4);
    memcpy(c->dstColorspaceTable, table, sizeof(int)*4);

    c->brightness= brightness;
    c->contrast = contrast;
    c->saturation= saturation;
    c->srcRange = srcRange;
    c->dstRange = dstRange;

    /* Neutral chroma bias replicated into four 16-bit lanes.
     * NOTE(review): presumably consumed by the SIMD YUV->RGB code that reads
     * uOffset/vOffset — confirm against that asm. */
    c->uOffset= 0x0400040004000400LL;
    c->vOffset= 0x0400040004000400LL;

    if (!srcRange){
        /* 16..235 luma: scale up to full range and subtract the 16 offset. */
        cy= (cy*255) / 219;
        oy= 16<<16;
    }else{
        /* Full-range source: the chroma coeffs assume 16..240, rescale. */
        crv= (crv*224) / 255;
        cbu= (cbu*224) / 255;
        cgu= (cgu*224) / 255;
        cgv= (cgv*224) / 255;
    }

    /* Apply contrast (16.16) to luma and contrast*saturation (32.32 combined,
     * hence the >>32) to the chroma coefficients. */
    cy = (cy *contrast )>>16;
    crv= (crv*contrast * saturation)>>32;
    cbu= (cbu*contrast * saturation)>>32;
    cgu= (cgu*contrast * saturation)>>32;
    cgv= (cgv*contrast * saturation)>>32;

    oy -= 256*brightness;

    /* Round each coefficient to 16 bits and replicate it into four lanes. */
    c->yCoeff= roundToInt16(cy *8192) * 0x0001000100010001ULL;
    c->vrCoeff= roundToInt16(crv*8192) * 0x0001000100010001ULL;
    c->ubCoeff= roundToInt16(cbu*8192) * 0x0001000100010001ULL;
    c->vgCoeff= roundToInt16(cgv*8192) * 0x0001000100010001ULL;
    c->ugCoeff= roundToInt16(cgu*8192) * 0x0001000100010001ULL;
    c->yOffset= roundToInt16(oy * 8) * 0x0001000100010001ULL;

    yuv2rgb_c_init_tables(c, inv_table, srcRange, brightness, contrast, saturation);
    //FIXME factorize

#ifdef COMPILE_ALTIVEC
    if (c->flags & SWS_CPU_CAPS_ALTIVEC)
        yuv2rgb_altivec_init_tables (c, inv_table, brightness, contrast, saturation);
#endif
    return 0;
}
1932 | |||
1933 | /** | ||
1934 | * @return -1 if not supported | ||
1935 | */ | ||
1936 | int sws_getColorspaceDetails(SwsContext *c, int **inv_table, int *srcRange, int **table, int *dstRange, int *brightness, int *contrast, int *saturation){ | ||
1937 | if (isYUV(c->dstFormat) || isGray(c->dstFormat)) return -1; | ||
1938 | |||
1939 | *inv_table = c->srcColorspaceTable; | ||
1940 | *table = c->dstColorspaceTable; | ||
1941 | *srcRange = c->srcRange; | ||
1942 | *dstRange = c->dstRange; | ||
1943 | *brightness= c->brightness; | ||
1944 | *contrast = c->contrast; | ||
1945 | *saturation= c->saturation; | ||
1946 | |||
1947 | return 0; | ||
1948 | } | ||
1949 | |||
1950 | static int handle_jpeg(int *format) | ||
1951 | { | ||
1952 | switch (*format) { | ||
1953 | case PIX_FMT_YUVJ420P: | ||
1954 | *format = PIX_FMT_YUV420P; | ||
1955 | return 1; | ||
1956 | case PIX_FMT_YUVJ422P: | ||
1957 | *format = PIX_FMT_YUV422P; | ||
1958 | return 1; | ||
1959 | case PIX_FMT_YUVJ444P: | ||
1960 | *format = PIX_FMT_YUV444P; | ||
1961 | return 1; | ||
1962 | case PIX_FMT_YUVJ440P: | ||
1963 | *format = PIX_FMT_YUV440P; | ||
1964 | return 1; | ||
1965 | default: | ||
1966 | return 0; | ||
1967 | } | ||
1968 | } | ||
1969 | |||
1970 | SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags, | ||
1971 | SwsFilter *srcFilter, SwsFilter *dstFilter, const double *param){ | ||
1972 | |||
1973 | SwsContext *c; | ||
1974 | int i; | ||
1975 | int usesVFilter, usesHFilter; | ||
1976 | int unscaled, needsDither; | ||
1977 | int srcRange, dstRange; | ||
1978 | SwsFilter dummyFilter= {NULL, NULL, NULL, NULL}; | ||
1979 | #if defined(ARCH_X86) | ||
1980 | if (flags & SWS_CPU_CAPS_MMX) | ||
1981 | asm volatile("emms\n\t"::: "memory"); | ||
1982 | #endif | ||
1983 | |||
1984 | #if !defined(RUNTIME_CPUDETECT) || !defined (CONFIG_GPL) //ensure that the flags match the compiled variant if cpudetect is off | ||
1985 | flags &= ~(SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2|SWS_CPU_CAPS_3DNOW|SWS_CPU_CAPS_ALTIVEC|SWS_CPU_CAPS_BFIN); | ||
1986 | #ifdef HAVE_MMX2 | ||
1987 | flags |= SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2; | ||
1988 | #elif defined (HAVE_3DNOW) | ||
1989 | flags |= SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_3DNOW; | ||
1990 | #elif defined (HAVE_MMX) | ||
1991 | flags |= SWS_CPU_CAPS_MMX; | ||
1992 | #elif defined (HAVE_ALTIVEC) | ||
1993 | flags |= SWS_CPU_CAPS_ALTIVEC; | ||
1994 | #elif defined (ARCH_BFIN) | ||
1995 | flags |= SWS_CPU_CAPS_BFIN; | ||
1996 | #endif | ||
1997 | #endif /* RUNTIME_CPUDETECT */ | ||
1998 | if (clip_table[512] != 255) globalInit(); | ||
1999 | if (!rgb15to16) sws_rgb2rgb_init(flags); | ||
2000 | |||
2001 | unscaled = (srcW == dstW && srcH == dstH); | ||
2002 | needsDither= (isBGR(dstFormat) || isRGB(dstFormat)) | ||
2003 | && (fmt_depth(dstFormat))<24 | ||
2004 | && ((fmt_depth(dstFormat))<(fmt_depth(srcFormat)) || (!(isRGB(srcFormat) || isBGR(srcFormat)))); | ||
2005 | |||
2006 | srcRange = handle_jpeg(&srcFormat); | ||
2007 | dstRange = handle_jpeg(&dstFormat); | ||
2008 | |||
2009 | if (!isSupportedIn(srcFormat)) | ||
2010 | { | ||
2011 | av_log(NULL, AV_LOG_ERROR, "swScaler: %s is not supported as input pixel format\n", sws_format_name(srcFormat)); | ||
2012 | return NULL; | ||
2013 | } | ||
2014 | if (!isSupportedOut(dstFormat)) | ||
2015 | { | ||
2016 | av_log(NULL, AV_LOG_ERROR, "swScaler: %s is not supported as output pixel format\n", sws_format_name(dstFormat)); | ||
2017 | return NULL; | ||
2018 | } | ||
2019 | |||
2020 | i= flags & ( SWS_POINT | ||
2021 | |SWS_AREA | ||
2022 | |SWS_BILINEAR | ||
2023 | |SWS_FAST_BILINEAR | ||
2024 | |SWS_BICUBIC | ||
2025 | |SWS_X | ||
2026 | |SWS_GAUSS | ||
2027 | |SWS_LANCZOS | ||
2028 | |SWS_SINC | ||
2029 | |SWS_SPLINE | ||
2030 | |SWS_BICUBLIN); | ||
2031 | if(!i || (i & (i-1))) | ||
2032 | { | ||
2033 | av_log(NULL, AV_LOG_ERROR, "swScaler: Exactly one scaler algorithm must be choosen\n"); | ||
2034 | return NULL; | ||
2035 | } | ||
2036 | |||
2037 | |||
2038 | /* sanity check */ | ||
2039 | if (srcW<4 || srcH<1 || dstW<8 || dstH<1) //FIXME check if these are enough and try to lowwer them after fixing the relevant parts of the code | ||
2040 | { | ||
2041 | av_log(NULL, AV_LOG_ERROR, "swScaler: %dx%d -> %dx%d is invalid scaling dimension\n", | ||
2042 | srcW, srcH, dstW, dstH); | ||
2043 | return NULL; | ||
2044 | } | ||
2045 | if(srcW > VOFW || dstW > VOFW){ | ||
2046 | av_log(NULL, AV_LOG_ERROR, "swScaler: Compile time max width is "AV_STRINGIFY(VOFW)" change VOF/VOFW and recompile\n"); | ||
2047 | return NULL; | ||
2048 | } | ||
2049 | |||
2050 | if (!dstFilter) dstFilter= &dummyFilter; | ||
2051 | if (!srcFilter) srcFilter= &dummyFilter; | ||
2052 | |||
2053 | c= av_mallocz(sizeof(SwsContext)); | ||
2054 | |||
2055 | c->av_class = &sws_context_class; | ||
2056 | c->srcW= srcW; | ||
2057 | c->srcH= srcH; | ||
2058 | c->dstW= dstW; | ||
2059 | c->dstH= dstH; | ||
2060 | c->lumXInc= ((srcW<<16) + (dstW>>1))/dstW; | ||
2061 | c->lumYInc= ((srcH<<16) + (dstH>>1))/dstH; | ||
2062 | c->flags= flags; | ||
2063 | c->dstFormat= dstFormat; | ||
2064 | c->srcFormat= srcFormat; | ||
2065 | c->vRounder= 4* 0x0001000100010001ULL; | ||
2066 | |||
2067 | usesHFilter= usesVFilter= 0; | ||
2068 | if (dstFilter->lumV && dstFilter->lumV->length>1) usesVFilter=1; | ||
2069 | if (dstFilter->lumH && dstFilter->lumH->length>1) usesHFilter=1; | ||
2070 | if (dstFilter->chrV && dstFilter->chrV->length>1) usesVFilter=1; | ||
2071 | if (dstFilter->chrH && dstFilter->chrH->length>1) usesHFilter=1; | ||
2072 | if (srcFilter->lumV && srcFilter->lumV->length>1) usesVFilter=1; | ||
2073 | if (srcFilter->lumH && srcFilter->lumH->length>1) usesHFilter=1; | ||
2074 | if (srcFilter->chrV && srcFilter->chrV->length>1) usesVFilter=1; | ||
2075 | if (srcFilter->chrH && srcFilter->chrH->length>1) usesHFilter=1; | ||
2076 | |||
2077 | getSubSampleFactors(&c->chrSrcHSubSample, &c->chrSrcVSubSample, srcFormat); | ||
2078 | getSubSampleFactors(&c->chrDstHSubSample, &c->chrDstVSubSample, dstFormat); | ||
2079 | |||
2080 | // reuse chroma for 2 pixles rgb/bgr unless user wants full chroma interpolation | ||
2081 | if ((isBGR(dstFormat) || isRGB(dstFormat)) && !(flags&SWS_FULL_CHR_H_INT)) c->chrDstHSubSample=1; | ||
2082 | |||
2083 | // drop some chroma lines if the user wants it | ||
2084 | c->vChrDrop= (flags&SWS_SRC_V_CHR_DROP_MASK)>>SWS_SRC_V_CHR_DROP_SHIFT; | ||
2085 | c->chrSrcVSubSample+= c->vChrDrop; | ||
2086 | |||
2087 | // drop every 2. pixel for chroma calculation unless user wants full chroma | ||
2088 | if ((isBGR(srcFormat) || isRGB(srcFormat)) && !(flags&SWS_FULL_CHR_H_INP) | ||
2089 | && srcFormat!=PIX_FMT_RGB8 && srcFormat!=PIX_FMT_BGR8 | ||
2090 | && srcFormat!=PIX_FMT_RGB4 && srcFormat!=PIX_FMT_BGR4 | ||
2091 | && srcFormat!=PIX_FMT_RGB4_BYTE && srcFormat!=PIX_FMT_BGR4_BYTE) | ||
2092 | c->chrSrcHSubSample=1; | ||
2093 | |||
2094 | if (param){ | ||
2095 | c->param[0] = param[0]; | ||
2096 | c->param[1] = param[1]; | ||
2097 | }else{ | ||
2098 | c->param[0] = | ||
2099 | c->param[1] = SWS_PARAM_DEFAULT; | ||
2100 | } | ||
2101 | |||
2102 | c->chrIntHSubSample= c->chrDstHSubSample; | ||
2103 | c->chrIntVSubSample= c->chrSrcVSubSample; | ||
2104 | |||
2105 | // Note the -((-x)>>y) is so that we always round toward +inf. | ||
2106 | c->chrSrcW= -((-srcW) >> c->chrSrcHSubSample); | ||
2107 | c->chrSrcH= -((-srcH) >> c->chrSrcVSubSample); | ||
2108 | c->chrDstW= -((-dstW) >> c->chrDstHSubSample); | ||
2109 | c->chrDstH= -((-dstH) >> c->chrDstVSubSample); | ||
2110 | |||
2111 | sws_setColorspaceDetails(c, Inverse_Table_6_9[SWS_CS_DEFAULT], srcRange, Inverse_Table_6_9[SWS_CS_DEFAULT] /* FIXME*/, dstRange, 0, 1<<16, 1<<16); | ||
2112 | |||
2113 | /* unscaled special Cases */ | ||
2114 | if (unscaled && !usesHFilter && !usesVFilter) | ||
2115 | { | ||
2116 | /* yv12_to_nv12 */ | ||
2117 | if (srcFormat == PIX_FMT_YUV420P && (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21)) | ||
2118 | { | ||
2119 | c->swScale= PlanarToNV12Wrapper; | ||
2120 | } | ||
2121 | #ifdef CONFIG_GPL | ||
2122 | /* yuv2bgr */ | ||
2123 | if ((srcFormat==PIX_FMT_YUV420P || srcFormat==PIX_FMT_YUV422P) && (isBGR(dstFormat) || isRGB(dstFormat))) | ||
2124 | { | ||
2125 | c->swScale= yuv2rgb_get_func_ptr(c); | ||
2126 | } | ||
2127 | #endif | ||
2128 | |||
2129 | if (srcFormat==PIX_FMT_YUV410P && dstFormat==PIX_FMT_YUV420P) | ||
2130 | { | ||
2131 | c->swScale= yvu9toyv12Wrapper; | ||
2132 | } | ||
2133 | |||
2134 | /* bgr24toYV12 */ | ||
2135 | if (srcFormat==PIX_FMT_BGR24 && dstFormat==PIX_FMT_YUV420P) | ||
2136 | c->swScale= bgr24toyv12Wrapper; | ||
2137 | |||
2138 | /* rgb/bgr -> rgb/bgr (no dither needed forms) */ | ||
2139 | if ( (isBGR(srcFormat) || isRGB(srcFormat)) | ||
2140 | && (isBGR(dstFormat) || isRGB(dstFormat)) | ||
2141 | && srcFormat != PIX_FMT_BGR8 && dstFormat != PIX_FMT_BGR8 | ||
2142 | && srcFormat != PIX_FMT_RGB8 && dstFormat != PIX_FMT_RGB8 | ||
2143 | && srcFormat != PIX_FMT_BGR4 && dstFormat != PIX_FMT_BGR4 | ||
2144 | && srcFormat != PIX_FMT_RGB4 && dstFormat != PIX_FMT_RGB4 | ||
2145 | && srcFormat != PIX_FMT_BGR4_BYTE && dstFormat != PIX_FMT_BGR4_BYTE | ||
2146 | && srcFormat != PIX_FMT_RGB4_BYTE && dstFormat != PIX_FMT_RGB4_BYTE | ||
2147 | && srcFormat != PIX_FMT_MONOBLACK && dstFormat != PIX_FMT_MONOBLACK | ||
2148 | && !needsDither) | ||
2149 | c->swScale= rgb2rgbWrapper; | ||
2150 | |||
2151 | /* LQ converters if -sws 0 or -sws 4*/ | ||
2152 | if (c->flags&(SWS_FAST_BILINEAR|SWS_POINT)){ | ||
2153 | /* rgb/bgr -> rgb/bgr (dither needed forms) */ | ||
2154 | if ( (isBGR(srcFormat) || isRGB(srcFormat)) | ||
2155 | && (isBGR(dstFormat) || isRGB(dstFormat)) | ||
2156 | && needsDither) | ||
2157 | c->swScale= rgb2rgbWrapper; | ||
2158 | |||
2159 | /* yv12_to_yuy2 */ | ||
2160 | if (srcFormat == PIX_FMT_YUV420P && | ||
2161 | (dstFormat == PIX_FMT_YUYV422 || dstFormat == PIX_FMT_UYVY422)) | ||
2162 | { | ||
2163 | if (dstFormat == PIX_FMT_YUYV422) | ||
2164 | c->swScale= PlanarToYuy2Wrapper; | ||
2165 | else | ||
2166 | c->swScale= PlanarToUyvyWrapper; | ||
2167 | } | ||
2168 | } | ||
2169 | |||
2170 | #ifdef COMPILE_ALTIVEC | ||
2171 | if ((c->flags & SWS_CPU_CAPS_ALTIVEC) && | ||
2172 | ((srcFormat == PIX_FMT_YUV420P && | ||
2173 | (dstFormat == PIX_FMT_YUYV422 || dstFormat == PIX_FMT_UYVY422)))) { | ||
2174 | // unscaled YV12 -> packed YUV, we want speed | ||
2175 | if (dstFormat == PIX_FMT_YUYV422) | ||
2176 | c->swScale= yv12toyuy2_unscaled_altivec; | ||
2177 | else | ||
2178 | c->swScale= yv12touyvy_unscaled_altivec; | ||
2179 | } | ||
2180 | #endif | ||
2181 | |||
2182 | /* simple copy */ | ||
2183 | if ( srcFormat == dstFormat | ||
2184 | || (isPlanarYUV(srcFormat) && isGray(dstFormat)) | ||
2185 | || (isPlanarYUV(dstFormat) && isGray(srcFormat))) | ||
2186 | { | ||
2187 | if (isPacked(c->srcFormat)) | ||
2188 | c->swScale= packedCopy; | ||
2189 | else /* Planar YUV or gray */ | ||
2190 | c->swScale= planarCopy; | ||
2191 | } | ||
2192 | |||
2193 | /* gray16{le,be} conversions */ | ||
2194 | if (isGray16(srcFormat) && (isPlanarYUV(dstFormat) || (dstFormat == PIX_FMT_GRAY8))) | ||
2195 | { | ||
2196 | c->swScale= gray16togray; | ||
2197 | } | ||
2198 | if ((isPlanarYUV(srcFormat) || (srcFormat == PIX_FMT_GRAY8)) && isGray16(dstFormat)) | ||
2199 | { | ||
2200 | c->swScale= graytogray16; | ||
2201 | } | ||
2202 | if (srcFormat != dstFormat && isGray16(srcFormat) && isGray16(dstFormat)) | ||
2203 | { | ||
2204 | c->swScale= gray16swap; | ||
2205 | } | ||
2206 | |||
2207 | #ifdef ARCH_BFIN | ||
2208 | if (flags & SWS_CPU_CAPS_BFIN) | ||
2209 | ff_bfin_get_unscaled_swscale (c); | ||
2210 | #endif | ||
2211 | |||
2212 | if (c->swScale){ | ||
2213 | if (flags&SWS_PRINT_INFO) | ||
2214 | av_log(c, AV_LOG_INFO, "using unscaled %s -> %s special converter\n", | ||
2215 | sws_format_name(srcFormat), sws_format_name(dstFormat)); | ||
2216 | return c; | ||
2217 | } | ||
2218 | } | ||
2219 | |||
2220 | if (flags & SWS_CPU_CAPS_MMX2) | ||
2221 | { | ||
2222 | c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0; | ||
2223 | if (!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR)) | ||
2224 | { | ||
2225 | if (flags&SWS_PRINT_INFO) | ||
2226 | av_log(c, AV_LOG_INFO, "output Width is not a multiple of 32 -> no MMX2 scaler\n"); | ||
2227 | } | ||
2228 | if (usesHFilter) c->canMMX2BeUsed=0; | ||
2229 | } | ||
2230 | else | ||
2231 | c->canMMX2BeUsed=0; | ||
2232 | |||
2233 | c->chrXInc= ((c->chrSrcW<<16) + (c->chrDstW>>1))/c->chrDstW; | ||
2234 | c->chrYInc= ((c->chrSrcH<<16) + (c->chrDstH>>1))/c->chrDstH; | ||
2235 | |||
2236 | // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst | ||
2237 | // but only for the FAST_BILINEAR mode otherwise do correct scaling | ||
2238 | // n-2 is the last chrominance sample available | ||
2239 | // this is not perfect, but no one should notice the difference, the more correct variant | ||
2240 | // would be like the vertical one, but that would require some special code for the | ||
2241 | // first and last pixel | ||
2242 | if (flags&SWS_FAST_BILINEAR) | ||
2243 | { | ||
2244 | if (c->canMMX2BeUsed) | ||
2245 | { | ||
2246 | c->lumXInc+= 20; | ||
2247 | c->chrXInc+= 20; | ||
2248 | } | ||
2249 | //we don't use the x86asm scaler if mmx is available | ||
2250 | else if (flags & SWS_CPU_CAPS_MMX) | ||
2251 | { | ||
2252 | c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20; | ||
2253 | c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20; | ||
2254 | } | ||
2255 | } | ||
2256 | |||
2257 | /* precalculate horizontal scaler filter coefficients */ | ||
2258 | { | ||
2259 | const int filterAlign= | ||
2260 | (flags & SWS_CPU_CAPS_MMX) ? 4 : | ||
2261 | (flags & SWS_CPU_CAPS_ALTIVEC) ? 8 : | ||
2262 | 1; | ||
2263 | |||
2264 | initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc, | ||
2265 | srcW , dstW, filterAlign, 1<<14, | ||
2266 | (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC) : flags, | ||
2267 | srcFilter->lumH, dstFilter->lumH, c->param); | ||
2268 | initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc, | ||
2269 | c->chrSrcW, c->chrDstW, filterAlign, 1<<14, | ||
2270 | (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags, | ||
2271 | srcFilter->chrH, dstFilter->chrH, c->param); | ||
2272 | |||
2273 | #define MAX_FUNNY_CODE_SIZE 10000 | ||
2274 | #if defined(COMPILE_MMX2) | ||
2275 | // can't downscale !!! | ||
2276 | if (c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR)) | ||
2277 | { | ||
2278 | #ifdef MAP_ANONYMOUS | ||
2279 | c->funnyYCode = (uint8_t*)mmap(NULL, MAX_FUNNY_CODE_SIZE, PROT_EXEC | PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); | ||
2280 | c->funnyUVCode = (uint8_t*)mmap(NULL, MAX_FUNNY_CODE_SIZE, PROT_EXEC | PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); | ||
2281 | #else | ||
2282 | c->funnyYCode = av_malloc(MAX_FUNNY_CODE_SIZE); | ||
2283 | c->funnyUVCode = av_malloc(MAX_FUNNY_CODE_SIZE); | ||
2284 | #endif | ||
2285 | |||
2286 | c->lumMmx2Filter = av_malloc((dstW /8+8)*sizeof(int16_t)); | ||
2287 | c->chrMmx2Filter = av_malloc((c->chrDstW /4+8)*sizeof(int16_t)); | ||
2288 | c->lumMmx2FilterPos= av_malloc((dstW /2/8+8)*sizeof(int32_t)); | ||
2289 | c->chrMmx2FilterPos= av_malloc((c->chrDstW/2/4+8)*sizeof(int32_t)); | ||
2290 | |||
2291 | initMMX2HScaler( dstW, c->lumXInc, c->funnyYCode , c->lumMmx2Filter, c->lumMmx2FilterPos, 8); | ||
2292 | initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode, c->chrMmx2Filter, c->chrMmx2FilterPos, 4); | ||
2293 | } | ||
2294 | #endif /* defined(COMPILE_MMX2) */ | ||
2295 | } // Init Horizontal stuff | ||
2296 | |||
2297 | |||
2298 | |||
2299 | /* precalculate vertical scaler filter coefficients */ | ||
2300 | { | ||
2301 | const int filterAlign= | ||
2302 | (flags & SWS_CPU_CAPS_MMX) && (flags & SWS_ACCURATE_RND) ? 2 : | ||
2303 | (flags & SWS_CPU_CAPS_ALTIVEC) ? 8 : | ||
2304 | 1; | ||
2305 | |||
2306 | initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc, | ||
2307 | srcH , dstH, filterAlign, (1<<12)-4, | ||
2308 | (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC) : flags, | ||
2309 | srcFilter->lumV, dstFilter->lumV, c->param); | ||
2310 | initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc, | ||
2311 | c->chrSrcH, c->chrDstH, filterAlign, (1<<12)-4, | ||
2312 | (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags, | ||
2313 | srcFilter->chrV, dstFilter->chrV, c->param); | ||
2314 | |||
2315 | #ifdef HAVE_ALTIVEC | ||
2316 | c->vYCoeffsBank = av_malloc(sizeof (vector signed short)*c->vLumFilterSize*c->dstH); | ||
2317 | c->vCCoeffsBank = av_malloc(sizeof (vector signed short)*c->vChrFilterSize*c->chrDstH); | ||
2318 | |||
2319 | for (i=0;i<c->vLumFilterSize*c->dstH;i++) { | ||
2320 | int j; | ||
2321 | short *p = (short *)&c->vYCoeffsBank[i]; | ||
2322 | for (j=0;j<8;j++) | ||
2323 | p[j] = c->vLumFilter[i]; | ||
2324 | } | ||
2325 | |||
2326 | for (i=0;i<c->vChrFilterSize*c->chrDstH;i++) { | ||
2327 | int j; | ||
2328 | short *p = (short *)&c->vCCoeffsBank[i]; | ||
2329 | for (j=0;j<8;j++) | ||
2330 | p[j] = c->vChrFilter[i]; | ||
2331 | } | ||
2332 | #endif | ||
2333 | } | ||
2334 | |||
2335 | // Calculate Buffer Sizes so that they won't run out while handling these damn slices | ||
2336 | c->vLumBufSize= c->vLumFilterSize; | ||
2337 | c->vChrBufSize= c->vChrFilterSize; | ||
2338 | for (i=0; i<dstH; i++) | ||
2339 | { | ||
2340 | int chrI= i*c->chrDstH / dstH; | ||
2341 | int nextSlice= FFMAX(c->vLumFilterPos[i ] + c->vLumFilterSize - 1, | ||
2342 | ((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)<<c->chrSrcVSubSample)); | ||
2343 | |||
2344 | nextSlice>>= c->chrSrcVSubSample; | ||
2345 | nextSlice<<= c->chrSrcVSubSample; | ||
2346 | if (c->vLumFilterPos[i ] + c->vLumBufSize < nextSlice) | ||
2347 | c->vLumBufSize= nextSlice - c->vLumFilterPos[i]; | ||
2348 | if (c->vChrFilterPos[chrI] + c->vChrBufSize < (nextSlice>>c->chrSrcVSubSample)) | ||
2349 | c->vChrBufSize= (nextSlice>>c->chrSrcVSubSample) - c->vChrFilterPos[chrI]; | ||
2350 | } | ||
2351 | |||
2352 | // allocate pixbufs (we use dynamic allocation because otherwise we would need to | ||
2353 | c->lumPixBuf= av_malloc(c->vLumBufSize*2*sizeof(int16_t*)); | ||
2354 | c->chrPixBuf= av_malloc(c->vChrBufSize*2*sizeof(int16_t*)); | ||
2355 | //Note we need at least one pixel more at the end because of the mmx code (just in case someone wanna replace the 4000/8000) | ||
2356 | /* align at 16 bytes for AltiVec */ | ||
2357 | for (i=0; i<c->vLumBufSize; i++) | ||
2358 | c->lumPixBuf[i]= c->lumPixBuf[i+c->vLumBufSize]= av_mallocz(VOF+1); | ||
2359 | for (i=0; i<c->vChrBufSize; i++) | ||
2360 | c->chrPixBuf[i]= c->chrPixBuf[i+c->vChrBufSize]= av_malloc((VOF+1)*2); | ||
2361 | |||
2362 | //try to avoid drawing green stuff between the right end and the stride end | ||
2363 | for (i=0; i<c->vChrBufSize; i++) memset(c->chrPixBuf[i], 64, (VOF+1)*2); | ||
2364 | |||
2365 | assert(2*VOFW == VOF); | ||
2366 | |||
2367 | assert(c->chrDstH <= dstH); | ||
2368 | |||
2369 | if (flags&SWS_PRINT_INFO) | ||
2370 | { | ||
2371 | #ifdef DITHER1XBPP | ||
2372 | const char *dither= " dithered"; | ||
2373 | #else | ||
2374 | const char *dither= ""; | ||
2375 | #endif | ||
2376 | if (flags&SWS_FAST_BILINEAR) | ||
2377 | av_log(c, AV_LOG_INFO, "FAST_BILINEAR scaler, "); | ||
2378 | else if (flags&SWS_BILINEAR) | ||
2379 | av_log(c, AV_LOG_INFO, "BILINEAR scaler, "); | ||
2380 | else if (flags&SWS_BICUBIC) | ||
2381 | av_log(c, AV_LOG_INFO, "BICUBIC scaler, "); | ||
2382 | else if (flags&SWS_X) | ||
2383 | av_log(c, AV_LOG_INFO, "Experimental scaler, "); | ||
2384 | else if (flags&SWS_POINT) | ||
2385 | av_log(c, AV_LOG_INFO, "Nearest Neighbor / POINT scaler, "); | ||
2386 | else if (flags&SWS_AREA) | ||
2387 | av_log(c, AV_LOG_INFO, "Area Averageing scaler, "); | ||
2388 | else if (flags&SWS_BICUBLIN) | ||
2389 | av_log(c, AV_LOG_INFO, "luma BICUBIC / chroma BILINEAR scaler, "); | ||
2390 | else if (flags&SWS_GAUSS) | ||
2391 | av_log(c, AV_LOG_INFO, "Gaussian scaler, "); | ||
2392 | else if (flags&SWS_SINC) | ||
2393 | av_log(c, AV_LOG_INFO, "Sinc scaler, "); | ||
2394 | else if (flags&SWS_LANCZOS) | ||
2395 | av_log(c, AV_LOG_INFO, "Lanczos scaler, "); | ||
2396 | else if (flags&SWS_SPLINE) | ||
2397 | av_log(c, AV_LOG_INFO, "Bicubic spline scaler, "); | ||
2398 | else | ||
2399 | av_log(c, AV_LOG_INFO, "ehh flags invalid?! "); | ||
2400 | |||
2401 | if (dstFormat==PIX_FMT_BGR555 || dstFormat==PIX_FMT_BGR565) | ||
2402 | av_log(c, AV_LOG_INFO, "from %s to%s %s ", | ||
2403 | sws_format_name(srcFormat), dither, sws_format_name(dstFormat)); | ||
2404 | else | ||
2405 | av_log(c, AV_LOG_INFO, "from %s to %s ", | ||
2406 | sws_format_name(srcFormat), sws_format_name(dstFormat)); | ||
2407 | |||
2408 | if (flags & SWS_CPU_CAPS_MMX2) | ||
2409 | av_log(c, AV_LOG_INFO, "using MMX2\n"); | ||
2410 | else if (flags & SWS_CPU_CAPS_3DNOW) | ||
2411 | av_log(c, AV_LOG_INFO, "using 3DNOW\n"); | ||
2412 | else if (flags & SWS_CPU_CAPS_MMX) | ||
2413 | av_log(c, AV_LOG_INFO, "using MMX\n"); | ||
2414 | else if (flags & SWS_CPU_CAPS_ALTIVEC) | ||
2415 | av_log(c, AV_LOG_INFO, "using AltiVec\n"); | ||
2416 | else | ||
2417 | av_log(c, AV_LOG_INFO, "using C\n"); | ||
2418 | } | ||
2419 | |||
2420 | if (flags & SWS_PRINT_INFO) | ||
2421 | { | ||
2422 | if (flags & SWS_CPU_CAPS_MMX) | ||
2423 | { | ||
2424 | if (c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR)) | ||
2425 | av_log(c, AV_LOG_VERBOSE, "using FAST_BILINEAR MMX2 scaler for horizontal scaling\n"); | ||
2426 | else | ||
2427 | { | ||
2428 | if (c->hLumFilterSize==4) | ||
2429 | av_log(c, AV_LOG_VERBOSE, "using 4-tap MMX scaler for horizontal luminance scaling\n"); | ||
2430 | else if (c->hLumFilterSize==8) | ||
2431 | av_log(c, AV_LOG_VERBOSE, "using 8-tap MMX scaler for horizontal luminance scaling\n"); | ||
2432 | else | ||
2433 | av_log(c, AV_LOG_VERBOSE, "using n-tap MMX scaler for horizontal luminance scaling\n"); | ||
2434 | |||
2435 | if (c->hChrFilterSize==4) | ||
2436 | av_log(c, AV_LOG_VERBOSE, "using 4-tap MMX scaler for horizontal chrominance scaling\n"); | ||
2437 | else if (c->hChrFilterSize==8) | ||
2438 | av_log(c, AV_LOG_VERBOSE, "using 8-tap MMX scaler for horizontal chrominance scaling\n"); | ||
2439 | else | ||
2440 | av_log(c, AV_LOG_VERBOSE, "using n-tap MMX scaler for horizontal chrominance scaling\n"); | ||
2441 | } | ||
2442 | } | ||
2443 | else | ||
2444 | { | ||
2445 | #if defined(ARCH_X86) | ||
2446 | av_log(c, AV_LOG_VERBOSE, "using X86-Asm scaler for horizontal scaling\n"); | ||
2447 | #else | ||
2448 | if (flags & SWS_FAST_BILINEAR) | ||
2449 | av_log(c, AV_LOG_VERBOSE, "using FAST_BILINEAR C scaler for horizontal scaling\n"); | ||
2450 | else | ||
2451 | av_log(c, AV_LOG_VERBOSE, "using C scaler for horizontal scaling\n"); | ||
2452 | #endif | ||
2453 | } | ||
2454 | if (isPlanarYUV(dstFormat)) | ||
2455 | { | ||
2456 | if (c->vLumFilterSize==1) | ||
2457 | av_log(c, AV_LOG_VERBOSE, "using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C"); | ||
2458 | else | ||
2459 | av_log(c, AV_LOG_VERBOSE, "using n-tap %s scaler for vertical scaling (YV12 like)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C"); | ||
2460 | } | ||
2461 | else | ||
2462 | { | ||
2463 | if (c->vLumFilterSize==1 && c->vChrFilterSize==2) | ||
2464 | av_log(c, AV_LOG_VERBOSE, "using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n" | ||
2465 | " 2-tap scaler for vertical chrominance scaling (BGR)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C"); | ||
2466 | else if (c->vLumFilterSize==2 && c->vChrFilterSize==2) | ||
2467 | av_log(c, AV_LOG_VERBOSE, "using 2-tap linear %s scaler for vertical scaling (BGR)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C"); | ||
2468 | else | ||
2469 | av_log(c, AV_LOG_VERBOSE, "using n-tap %s scaler for vertical scaling (BGR)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C"); | ||
2470 | } | ||
2471 | |||
2472 | if (dstFormat==PIX_FMT_BGR24) | ||
2473 | av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR24 Converter\n", | ||
2474 | (flags & SWS_CPU_CAPS_MMX2) ? "MMX2" : ((flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C")); | ||
2475 | else if (dstFormat==PIX_FMT_RGB32) | ||
2476 | av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR32 Converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C"); | ||
2477 | else if (dstFormat==PIX_FMT_BGR565) | ||
2478 | av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR16 Converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C"); | ||
2479 | else if (dstFormat==PIX_FMT_BGR555) | ||
2480 | av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR15 Converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C"); | ||
2481 | |||
2482 | av_log(c, AV_LOG_VERBOSE, "%dx%d -> %dx%d\n", srcW, srcH, dstW, dstH); | ||
2483 | } | ||
2484 | if (flags & SWS_PRINT_INFO) | ||
2485 | { | ||
2486 | av_log(c, AV_LOG_DEBUG, "Lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n", | ||
2487 | c->srcW, c->srcH, c->dstW, c->dstH, c->lumXInc, c->lumYInc); | ||
2488 | av_log(c, AV_LOG_DEBUG, "Chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n", | ||
2489 | c->chrSrcW, c->chrSrcH, c->chrDstW, c->chrDstH, c->chrXInc, c->chrYInc); | ||
2490 | } | ||
2491 | |||
2492 | c->swScale= getSwsFunc(flags); | ||
2493 | return c; | ||
2494 | } | ||
2495 | |||
2496 | /** | ||
2497 | * swscale wrapper, so we don't need to export the SwsContext. | ||
2498 | * assumes planar YUV to be in YUV order instead of YVU | ||
2499 | */ | ||
2500 | int sws_scale(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | ||
2501 | int srcSliceH, uint8_t* dst[], int dstStride[]){ | ||
2502 | int i; | ||
2503 | uint8_t* src2[4]= {src[0], src[1], src[2]}; | ||
2504 | uint32_t pal[256]; | ||
2505 | if (c->sliceDir == 0 && srcSliceY != 0 && srcSliceY + srcSliceH != c->srcH) { | ||
2506 | av_log(c, AV_LOG_ERROR, "Slices start in the middle!\n"); | ||
2507 | return 0; | ||
2508 | } | ||
2509 | if (c->sliceDir == 0) { | ||
2510 | if (srcSliceY == 0) c->sliceDir = 1; else c->sliceDir = -1; | ||
2511 | } | ||
2512 | |||
2513 | if (c->srcFormat == PIX_FMT_PAL8){ | ||
2514 | for (i=0; i<256; i++){ | ||
2515 | int p= ((uint32_t*)(src[1]))[i]; | ||
2516 | int r= (p>>16)&0xFF; | ||
2517 | int g= (p>> 8)&0xFF; | ||
2518 | int b= p &0xFF; | ||
2519 | int y= av_clip_uint8(((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16 ); | ||
2520 | int u= av_clip_uint8(((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128); | ||
2521 | int v= av_clip_uint8(((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128); | ||
2522 | pal[i]= y + (u<<8) + (v<<16); | ||
2523 | } | ||
2524 | src2[1]= (uint8_t*)pal; | ||
2525 | } | ||
2526 | |||
2527 | // copy strides, so they can safely be modified | ||
2528 | if (c->sliceDir == 1) { | ||
2529 | // slices go from top to bottom | ||
2530 | int srcStride2[4]= {srcStride[0], srcStride[1], srcStride[2]}; | ||
2531 | int dstStride2[4]= {dstStride[0], dstStride[1], dstStride[2]}; | ||
2532 | return c->swScale(c, src2, srcStride2, srcSliceY, srcSliceH, dst, dstStride2); | ||
2533 | } else { | ||
2534 | // slices go from bottom to top => we flip the image internally | ||
2535 | uint8_t* dst2[4]= {dst[0] + (c->dstH-1)*dstStride[0], | ||
2536 | dst[1] + ((c->dstH>>c->chrDstVSubSample)-1)*dstStride[1], | ||
2537 | dst[2] + ((c->dstH>>c->chrDstVSubSample)-1)*dstStride[2]}; | ||
2538 | int srcStride2[4]= {-srcStride[0], -srcStride[1], -srcStride[2]}; | ||
2539 | int dstStride2[4]= {-dstStride[0], -dstStride[1], -dstStride[2]}; | ||
2540 | |||
2541 | src2[0] += (srcSliceH-1)*srcStride[0]; | ||
2542 | if (c->srcFormat != PIX_FMT_PAL8) | ||
2543 | src2[1] += ((srcSliceH>>c->chrSrcVSubSample)-1)*srcStride[1]; | ||
2544 | src2[2] += ((srcSliceH>>c->chrSrcVSubSample)-1)*srcStride[2]; | ||
2545 | |||
2546 | return c->swScale(c, src2, srcStride2, c->srcH-srcSliceY-srcSliceH, srcSliceH, dst2, dstStride2); | ||
2547 | } | ||
2548 | } | ||
2549 | |||
2550 | /** | ||
2551 | * swscale wrapper, so we don't need to export the SwsContext | ||
2552 | */ | ||
2553 | int sws_scale_ordered(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | ||
2554 | int srcSliceH, uint8_t* dst[], int dstStride[]){ | ||
2555 | return sws_scale(c, src, srcStride, srcSliceY, srcSliceH, dst, dstStride); | ||
2556 | } | ||
2557 | |||
2558 | SwsFilter *sws_getDefaultFilter(float lumaGBlur, float chromaGBlur, | ||
2559 | float lumaSharpen, float chromaSharpen, | ||
2560 | float chromaHShift, float chromaVShift, | ||
2561 | int verbose) | ||
2562 | { | ||
2563 | SwsFilter *filter= av_malloc(sizeof(SwsFilter)); | ||
2564 | |||
2565 | if (lumaGBlur!=0.0){ | ||
2566 | filter->lumH= sws_getGaussianVec(lumaGBlur, 3.0); | ||
2567 | filter->lumV= sws_getGaussianVec(lumaGBlur, 3.0); | ||
2568 | }else{ | ||
2569 | filter->lumH= sws_getIdentityVec(); | ||
2570 | filter->lumV= sws_getIdentityVec(); | ||
2571 | } | ||
2572 | |||
2573 | if (chromaGBlur!=0.0){ | ||
2574 | filter->chrH= sws_getGaussianVec(chromaGBlur, 3.0); | ||
2575 | filter->chrV= sws_getGaussianVec(chromaGBlur, 3.0); | ||
2576 | }else{ | ||
2577 | filter->chrH= sws_getIdentityVec(); | ||
2578 | filter->chrV= sws_getIdentityVec(); | ||
2579 | } | ||
2580 | |||
2581 | if (chromaSharpen!=0.0){ | ||
2582 | SwsVector *id= sws_getIdentityVec(); | ||
2583 | sws_scaleVec(filter->chrH, -chromaSharpen); | ||
2584 | sws_scaleVec(filter->chrV, -chromaSharpen); | ||
2585 | sws_addVec(filter->chrH, id); | ||
2586 | sws_addVec(filter->chrV, id); | ||
2587 | sws_freeVec(id); | ||
2588 | } | ||
2589 | |||
2590 | if (lumaSharpen!=0.0){ | ||
2591 | SwsVector *id= sws_getIdentityVec(); | ||
2592 | sws_scaleVec(filter->lumH, -lumaSharpen); | ||
2593 | sws_scaleVec(filter->lumV, -lumaSharpen); | ||
2594 | sws_addVec(filter->lumH, id); | ||
2595 | sws_addVec(filter->lumV, id); | ||
2596 | sws_freeVec(id); | ||
2597 | } | ||
2598 | |||
2599 | if (chromaHShift != 0.0) | ||
2600 | sws_shiftVec(filter->chrH, (int)(chromaHShift+0.5)); | ||
2601 | |||
2602 | if (chromaVShift != 0.0) | ||
2603 | sws_shiftVec(filter->chrV, (int)(chromaVShift+0.5)); | ||
2604 | |||
2605 | sws_normalizeVec(filter->chrH, 1.0); | ||
2606 | sws_normalizeVec(filter->chrV, 1.0); | ||
2607 | sws_normalizeVec(filter->lumH, 1.0); | ||
2608 | sws_normalizeVec(filter->lumV, 1.0); | ||
2609 | |||
2610 | if (verbose) sws_printVec(filter->chrH); | ||
2611 | if (verbose) sws_printVec(filter->lumH); | ||
2612 | |||
2613 | return filter; | ||
2614 | } | ||
2615 | |||
2616 | /** | ||
2617 | * returns a normalized gaussian curve used to filter stuff | ||
2618 | * quality=3 is high quality, lowwer is lowwer quality | ||
2619 | */ | ||
2620 | SwsVector *sws_getGaussianVec(double variance, double quality){ | ||
2621 | const int length= (int)(variance*quality + 0.5) | 1; | ||
2622 | int i; | ||
2623 | double *coeff= av_malloc(length*sizeof(double)); | ||
2624 | double middle= (length-1)*0.5; | ||
2625 | SwsVector *vec= av_malloc(sizeof(SwsVector)); | ||
2626 | |||
2627 | vec->coeff= coeff; | ||
2628 | vec->length= length; | ||
2629 | |||
2630 | for (i=0; i<length; i++) | ||
2631 | { | ||
2632 | double dist= i-middle; | ||
2633 | coeff[i]= exp(-dist*dist/(2*variance*variance)) / sqrt(2*variance*PI); | ||
2634 | } | ||
2635 | |||
2636 | sws_normalizeVec(vec, 1.0); | ||
2637 | |||
2638 | return vec; | ||
2639 | } | ||
2640 | |||
2641 | SwsVector *sws_getConstVec(double c, int length){ | ||
2642 | int i; | ||
2643 | double *coeff= av_malloc(length*sizeof(double)); | ||
2644 | SwsVector *vec= av_malloc(sizeof(SwsVector)); | ||
2645 | |||
2646 | vec->coeff= coeff; | ||
2647 | vec->length= length; | ||
2648 | |||
2649 | for (i=0; i<length; i++) | ||
2650 | coeff[i]= c; | ||
2651 | |||
2652 | return vec; | ||
2653 | } | ||
2654 | |||
2655 | |||
2656 | SwsVector *sws_getIdentityVec(void){ | ||
2657 | return sws_getConstVec(1.0, 1); | ||
2658 | } | ||
2659 | |||
2660 | double sws_dcVec(SwsVector *a){ | ||
2661 | int i; | ||
2662 | double sum=0; | ||
2663 | |||
2664 | for (i=0; i<a->length; i++) | ||
2665 | sum+= a->coeff[i]; | ||
2666 | |||
2667 | return sum; | ||
2668 | } | ||
2669 | |||
2670 | void sws_scaleVec(SwsVector *a, double scalar){ | ||
2671 | int i; | ||
2672 | |||
2673 | for (i=0; i<a->length; i++) | ||
2674 | a->coeff[i]*= scalar; | ||
2675 | } | ||
2676 | |||
2677 | void sws_normalizeVec(SwsVector *a, double height){ | ||
2678 | sws_scaleVec(a, height/sws_dcVec(a)); | ||
2679 | } | ||
2680 | |||
2681 | static SwsVector *sws_getConvVec(SwsVector *a, SwsVector *b){ | ||
2682 | int length= a->length + b->length - 1; | ||
2683 | double *coeff= av_malloc(length*sizeof(double)); | ||
2684 | int i, j; | ||
2685 | SwsVector *vec= av_malloc(sizeof(SwsVector)); | ||
2686 | |||
2687 | vec->coeff= coeff; | ||
2688 | vec->length= length; | ||
2689 | |||
2690 | for (i=0; i<length; i++) coeff[i]= 0.0; | ||
2691 | |||
2692 | for (i=0; i<a->length; i++) | ||
2693 | { | ||
2694 | for (j=0; j<b->length; j++) | ||
2695 | { | ||
2696 | coeff[i+j]+= a->coeff[i]*b->coeff[j]; | ||
2697 | } | ||
2698 | } | ||
2699 | |||
2700 | return vec; | ||
2701 | } | ||
2702 | |||
2703 | static SwsVector *sws_sumVec(SwsVector *a, SwsVector *b){ | ||
2704 | int length= FFMAX(a->length, b->length); | ||
2705 | double *coeff= av_malloc(length*sizeof(double)); | ||
2706 | int i; | ||
2707 | SwsVector *vec= av_malloc(sizeof(SwsVector)); | ||
2708 | |||
2709 | vec->coeff= coeff; | ||
2710 | vec->length= length; | ||
2711 | |||
2712 | for (i=0; i<length; i++) coeff[i]= 0.0; | ||
2713 | |||
2714 | for (i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i]; | ||
2715 | for (i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]+= b->coeff[i]; | ||
2716 | |||
2717 | return vec; | ||
2718 | } | ||
2719 | |||
2720 | static SwsVector *sws_diffVec(SwsVector *a, SwsVector *b){ | ||
2721 | int length= FFMAX(a->length, b->length); | ||
2722 | double *coeff= av_malloc(length*sizeof(double)); | ||
2723 | int i; | ||
2724 | SwsVector *vec= av_malloc(sizeof(SwsVector)); | ||
2725 | |||
2726 | vec->coeff= coeff; | ||
2727 | vec->length= length; | ||
2728 | |||
2729 | for (i=0; i<length; i++) coeff[i]= 0.0; | ||
2730 | |||
2731 | for (i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i]; | ||
2732 | for (i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]-= b->coeff[i]; | ||
2733 | |||
2734 | return vec; | ||
2735 | } | ||
2736 | |||
2737 | /* shift left / or right if "shift" is negative */ | ||
2738 | static SwsVector *sws_getShiftedVec(SwsVector *a, int shift){ | ||
2739 | int length= a->length + FFABS(shift)*2; | ||
2740 | double *coeff= av_malloc(length*sizeof(double)); | ||
2741 | int i; | ||
2742 | SwsVector *vec= av_malloc(sizeof(SwsVector)); | ||
2743 | |||
2744 | vec->coeff= coeff; | ||
2745 | vec->length= length; | ||
2746 | |||
2747 | for (i=0; i<length; i++) coeff[i]= 0.0; | ||
2748 | |||
2749 | for (i=0; i<a->length; i++) | ||
2750 | { | ||
2751 | coeff[i + (length-1)/2 - (a->length-1)/2 - shift]= a->coeff[i]; | ||
2752 | } | ||
2753 | |||
2754 | return vec; | ||
2755 | } | ||
2756 | |||
2757 | void sws_shiftVec(SwsVector *a, int shift){ | ||
2758 | SwsVector *shifted= sws_getShiftedVec(a, shift); | ||
2759 | av_free(a->coeff); | ||
2760 | a->coeff= shifted->coeff; | ||
2761 | a->length= shifted->length; | ||
2762 | av_free(shifted); | ||
2763 | } | ||
2764 | |||
2765 | void sws_addVec(SwsVector *a, SwsVector *b){ | ||
2766 | SwsVector *sum= sws_sumVec(a, b); | ||
2767 | av_free(a->coeff); | ||
2768 | a->coeff= sum->coeff; | ||
2769 | a->length= sum->length; | ||
2770 | av_free(sum); | ||
2771 | } | ||
2772 | |||
2773 | void sws_subVec(SwsVector *a, SwsVector *b){ | ||
2774 | SwsVector *diff= sws_diffVec(a, b); | ||
2775 | av_free(a->coeff); | ||
2776 | a->coeff= diff->coeff; | ||
2777 | a->length= diff->length; | ||
2778 | av_free(diff); | ||
2779 | } | ||
2780 | |||
2781 | void sws_convVec(SwsVector *a, SwsVector *b){ | ||
2782 | SwsVector *conv= sws_getConvVec(a, b); | ||
2783 | av_free(a->coeff); | ||
2784 | a->coeff= conv->coeff; | ||
2785 | a->length= conv->length; | ||
2786 | av_free(conv); | ||
2787 | } | ||
2788 | |||
2789 | SwsVector *sws_cloneVec(SwsVector *a){ | ||
2790 | double *coeff= av_malloc(a->length*sizeof(double)); | ||
2791 | int i; | ||
2792 | SwsVector *vec= av_malloc(sizeof(SwsVector)); | ||
2793 | |||
2794 | vec->coeff= coeff; | ||
2795 | vec->length= a->length; | ||
2796 | |||
2797 | for (i=0; i<a->length; i++) coeff[i]= a->coeff[i]; | ||
2798 | |||
2799 | return vec; | ||
2800 | } | ||
2801 | |||
2802 | void sws_printVec(SwsVector *a){ | ||
2803 | int i; | ||
2804 | double max=0; | ||
2805 | double min=0; | ||
2806 | double range; | ||
2807 | |||
2808 | for (i=0; i<a->length; i++) | ||
2809 | if (a->coeff[i]>max) max= a->coeff[i]; | ||
2810 | |||
2811 | for (i=0; i<a->length; i++) | ||
2812 | if (a->coeff[i]<min) min= a->coeff[i]; | ||
2813 | |||
2814 | range= max - min; | ||
2815 | |||
2816 | for (i=0; i<a->length; i++) | ||
2817 | { | ||
2818 | int x= (int)((a->coeff[i]-min)*60.0/range +0.5); | ||
2819 | av_log(NULL, AV_LOG_DEBUG, "%1.3f ", a->coeff[i]); | ||
2820 | for (;x>0; x--) av_log(NULL, AV_LOG_DEBUG, " "); | ||
2821 | av_log(NULL, AV_LOG_DEBUG, "|\n"); | ||
2822 | } | ||
2823 | } | ||
2824 | |||
2825 | void sws_freeVec(SwsVector *a){ | ||
2826 | if (!a) return; | ||
2827 | av_freep(&a->coeff); | ||
2828 | a->length=0; | ||
2829 | av_free(a); | ||
2830 | } | ||
2831 | |||
2832 | void sws_freeFilter(SwsFilter *filter){ | ||
2833 | if (!filter) return; | ||
2834 | |||
2835 | if (filter->lumH) sws_freeVec(filter->lumH); | ||
2836 | if (filter->lumV) sws_freeVec(filter->lumV); | ||
2837 | if (filter->chrH) sws_freeVec(filter->chrH); | ||
2838 | if (filter->chrV) sws_freeVec(filter->chrV); | ||
2839 | av_free(filter); | ||
2840 | } | ||
2841 | |||
2842 | |||
2843 | void sws_freeContext(SwsContext *c){ | ||
2844 | int i; | ||
2845 | if (!c) return; | ||
2846 | |||
2847 | if (c->lumPixBuf) | ||
2848 | { | ||
2849 | for (i=0; i<c->vLumBufSize; i++) | ||
2850 | av_freep(&c->lumPixBuf[i]); | ||
2851 | av_freep(&c->lumPixBuf); | ||
2852 | } | ||
2853 | |||
2854 | if (c->chrPixBuf) | ||
2855 | { | ||
2856 | for (i=0; i<c->vChrBufSize; i++) | ||
2857 | av_freep(&c->chrPixBuf[i]); | ||
2858 | av_freep(&c->chrPixBuf); | ||
2859 | } | ||
2860 | |||
2861 | av_freep(&c->vLumFilter); | ||
2862 | av_freep(&c->vChrFilter); | ||
2863 | av_freep(&c->hLumFilter); | ||
2864 | av_freep(&c->hChrFilter); | ||
2865 | #ifdef HAVE_ALTIVEC | ||
2866 | av_freep(&c->vYCoeffsBank); | ||
2867 | av_freep(&c->vCCoeffsBank); | ||
2868 | #endif | ||
2869 | |||
2870 | av_freep(&c->vLumFilterPos); | ||
2871 | av_freep(&c->vChrFilterPos); | ||
2872 | av_freep(&c->hLumFilterPos); | ||
2873 | av_freep(&c->hChrFilterPos); | ||
2874 | |||
2875 | #if defined(ARCH_X86) && defined(CONFIG_GPL) | ||
2876 | #ifdef MAP_ANONYMOUS | ||
2877 | if (c->funnyYCode) munmap(c->funnyYCode, MAX_FUNNY_CODE_SIZE); | ||
2878 | if (c->funnyUVCode) munmap(c->funnyUVCode, MAX_FUNNY_CODE_SIZE); | ||
2879 | #else | ||
2880 | av_free(c->funnyYCode); | ||
2881 | av_free(c->funnyUVCode); | ||
2882 | #endif | ||
2883 | c->funnyYCode=NULL; | ||
2884 | c->funnyUVCode=NULL; | ||
2885 | #endif /* defined(ARCH_X86) */ | ||
2886 | |||
2887 | av_freep(&c->lumMmx2Filter); | ||
2888 | av_freep(&c->chrMmx2Filter); | ||
2889 | av_freep(&c->lumMmx2FilterPos); | ||
2890 | av_freep(&c->chrMmx2FilterPos); | ||
2891 | av_freep(&c->yuvTable); | ||
2892 | |||
2893 | av_free(c); | ||
2894 | } | ||
2895 | |||
2896 | /** | ||
2897 | * Checks if context is valid or reallocs a new one instead. | ||
2898 | * If context is NULL, just calls sws_getContext() to get a new one. | ||
2899 | * Otherwise, checks if the parameters are the same already saved in context. | ||
2900 | * If that is the case, returns the current context. | ||
2901 | * Otherwise, frees context and gets a new one. | ||
2902 | * | ||
2903 | * Be warned that srcFilter, dstFilter are not checked, they are | ||
2904 | * asumed to remain valid. | ||
2905 | */ | ||
2906 | struct SwsContext *sws_getCachedContext(struct SwsContext *context, | ||
2907 | int srcW, int srcH, int srcFormat, | ||
2908 | int dstW, int dstH, int dstFormat, int flags, | ||
2909 | SwsFilter *srcFilter, SwsFilter *dstFilter, const double *param) | ||
2910 | { | ||
2911 | static const double default_param[2] = {SWS_PARAM_DEFAULT, SWS_PARAM_DEFAULT}; | ||
2912 | |||
2913 | if (!param) | ||
2914 | param = default_param; | ||
2915 | |||
2916 | if (context) { | ||
2917 | if (context->srcW != srcW || context->srcH != srcH || | ||
2918 | context->srcFormat != srcFormat || | ||
2919 | context->dstW != dstW || context->dstH != dstH || | ||
2920 | context->dstFormat != dstFormat || context->flags != flags || | ||
2921 | context->param[0] != param[0] || context->param[1] != param[1]) | ||
2922 | { | ||
2923 | sws_freeContext(context); | ||
2924 | context = NULL; | ||
2925 | } | ||
2926 | } | ||
2927 | if (!context) { | ||
2928 | return sws_getContext(srcW, srcH, srcFormat, | ||
2929 | dstW, dstH, dstFormat, flags, | ||
2930 | srcFilter, dstFilter, param); | ||
2931 | } | ||
2932 | return context; | ||
2933 | } | ||
2934 | |||
diff --git a/src/plugins/ffmpeg/libswscale/swscale.h b/src/plugins/ffmpeg/libswscale/swscale.h deleted file mode 100644 index 3a5b460..0000000 --- a/src/plugins/ffmpeg/libswscale/swscale.h +++ /dev/null | |||
@@ -1,146 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> | ||
3 | * | ||
4 | * This file is part of FFmpeg. | ||
5 | * | ||
6 | * FFmpeg is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU Lesser General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2.1 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * FFmpeg is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * Lesser General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU Lesser General Public | ||
17 | * License along with FFmpeg; if not, write to the Free Software | ||
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
19 | */ | ||
20 | |||
21 | #ifndef FFMPEG_SWSCALE_H | ||
22 | #define FFMPEG_SWSCALE_H | ||
23 | |||
24 | /** | ||
25 | * @file swscale.h | ||
26 | * @brief | ||
27 | * external api for the swscale stuff | ||
28 | */ | ||
29 | |||
30 | #include "libavutil/avutil.h" | ||
31 | |||
32 | #define LIBSWSCALE_VERSION_MAJOR 0 | ||
33 | #define LIBSWSCALE_VERSION_MINOR 5 | ||
34 | #define LIBSWSCALE_VERSION_MICRO 1 | ||
35 | |||
36 | #define LIBSWSCALE_VERSION_INT AV_VERSION_INT(LIBSWSCALE_VERSION_MAJOR, \ | ||
37 | LIBSWSCALE_VERSION_MINOR, \ | ||
38 | LIBSWSCALE_VERSION_MICRO) | ||
39 | #define LIBSWSCALE_VERSION AV_VERSION(LIBSWSCALE_VERSION_MAJOR, \ | ||
40 | LIBSWSCALE_VERSION_MINOR, \ | ||
41 | LIBSWSCALE_VERSION_MICRO) | ||
42 | #define LIBSWSCALE_BUILD LIBSWSCALE_VERSION_INT | ||
43 | |||
44 | #define LIBSWSCALE_IDENT "SwS" AV_STRINGIFY(LIBSWSCALE_VERSION) | ||
45 | |||
46 | /* values for the flags, the stuff on the command line is different */ | ||
47 | #define SWS_FAST_BILINEAR 1 | ||
48 | #define SWS_BILINEAR 2 | ||
49 | #define SWS_BICUBIC 4 | ||
50 | #define SWS_X 8 | ||
51 | #define SWS_POINT 0x10 | ||
52 | #define SWS_AREA 0x20 | ||
53 | #define SWS_BICUBLIN 0x40 | ||
54 | #define SWS_GAUSS 0x80 | ||
55 | #define SWS_SINC 0x100 | ||
56 | #define SWS_LANCZOS 0x200 | ||
57 | #define SWS_SPLINE 0x400 | ||
58 | |||
59 | #define SWS_SRC_V_CHR_DROP_MASK 0x30000 | ||
60 | #define SWS_SRC_V_CHR_DROP_SHIFT 16 | ||
61 | |||
62 | #define SWS_PARAM_DEFAULT 123456 | ||
63 | |||
64 | #define SWS_PRINT_INFO 0x1000 | ||
65 | |||
66 | //the following 3 flags are not completely implemented | ||
67 | //internal chrominace subsampling info | ||
68 | #define SWS_FULL_CHR_H_INT 0x2000 | ||
69 | //input subsampling info | ||
70 | #define SWS_FULL_CHR_H_INP 0x4000 | ||
71 | #define SWS_DIRECT_BGR 0x8000 | ||
72 | #define SWS_ACCURATE_RND 0x40000 | ||
73 | |||
74 | #define SWS_CPU_CAPS_MMX 0x80000000 | ||
75 | #define SWS_CPU_CAPS_MMX2 0x20000000 | ||
76 | #define SWS_CPU_CAPS_3DNOW 0x40000000 | ||
77 | #define SWS_CPU_CAPS_ALTIVEC 0x10000000 | ||
78 | #define SWS_CPU_CAPS_BFIN 0x01000000 | ||
79 | |||
80 | #define SWS_MAX_REDUCE_CUTOFF 0.002 | ||
81 | |||
82 | #define SWS_CS_ITU709 1 | ||
83 | #define SWS_CS_FCC 4 | ||
84 | #define SWS_CS_ITU601 5 | ||
85 | #define SWS_CS_ITU624 5 | ||
86 | #define SWS_CS_SMPTE170M 5 | ||
87 | #define SWS_CS_SMPTE240M 7 | ||
88 | #define SWS_CS_DEFAULT 5 | ||
89 | |||
90 | |||
91 | |||
92 | // when used for filters they must have an odd number of elements | ||
93 | // coeffs cannot be shared between vectors | ||
94 | typedef struct { | ||
95 | double *coeff; | ||
96 | int length; | ||
97 | } SwsVector; | ||
98 | |||
99 | // vectors can be shared | ||
100 | typedef struct { | ||
101 | SwsVector *lumH; | ||
102 | SwsVector *lumV; | ||
103 | SwsVector *chrH; | ||
104 | SwsVector *chrV; | ||
105 | } SwsFilter; | ||
106 | |||
107 | struct SwsContext; | ||
108 | |||
109 | void sws_freeContext(struct SwsContext *swsContext); | ||
110 | |||
111 | struct SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags, | ||
112 | SwsFilter *srcFilter, SwsFilter *dstFilter, const double *param); | ||
113 | int sws_scale(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY, | ||
114 | int srcSliceH, uint8_t* dst[], int dstStride[]); | ||
115 | int sws_scale_ordered(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY, | ||
116 | int srcSliceH, uint8_t* dst[], int dstStride[]) attribute_deprecated; | ||
117 | |||
118 | |||
119 | int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4], int srcRange, const int table[4], int dstRange, int brightness, int contrast, int saturation); | ||
120 | int sws_getColorspaceDetails(struct SwsContext *c, int **inv_table, int *srcRange, int **table, int *dstRange, int *brightness, int *contrast, int *saturation); | ||
121 | SwsVector *sws_getGaussianVec(double variance, double quality); | ||
122 | SwsVector *sws_getConstVec(double c, int length); | ||
123 | SwsVector *sws_getIdentityVec(void); | ||
124 | void sws_scaleVec(SwsVector *a, double scalar); | ||
125 | void sws_normalizeVec(SwsVector *a, double height); | ||
126 | void sws_convVec(SwsVector *a, SwsVector *b); | ||
127 | void sws_addVec(SwsVector *a, SwsVector *b); | ||
128 | void sws_subVec(SwsVector *a, SwsVector *b); | ||
129 | void sws_shiftVec(SwsVector *a, int shift); | ||
130 | SwsVector *sws_cloneVec(SwsVector *a); | ||
131 | |||
132 | void sws_printVec(SwsVector *a); | ||
133 | void sws_freeVec(SwsVector *a); | ||
134 | |||
135 | SwsFilter *sws_getDefaultFilter(float lumaGBlur, float chromaGBlur, | ||
136 | float lumaSarpen, float chromaSharpen, | ||
137 | float chromaHShift, float chromaVShift, | ||
138 | int verbose); | ||
139 | void sws_freeFilter(SwsFilter *filter); | ||
140 | |||
141 | struct SwsContext *sws_getCachedContext(struct SwsContext *context, | ||
142 | int srcW, int srcH, int srcFormat, | ||
143 | int dstW, int dstH, int dstFormat, int flags, | ||
144 | SwsFilter *srcFilter, SwsFilter *dstFilter, const double *param); | ||
145 | |||
146 | #endif /* FFMPEG_SWSCALE_H */ | ||
diff --git a/src/plugins/ffmpeg/libswscale/swscale_altivec_template.c b/src/plugins/ffmpeg/libswscale/swscale_altivec_template.c deleted file mode 100644 index 2111cec..0000000 --- a/src/plugins/ffmpeg/libswscale/swscale_altivec_template.c +++ /dev/null | |||
@@ -1,538 +0,0 @@ | |||
1 | /* | ||
2 | * AltiVec-enhanced yuv2yuvX | ||
3 | * | ||
4 | * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org> | ||
5 | * based on the equivalent C code in swscale.c | ||
6 | * | ||
7 | * This file is part of FFmpeg. | ||
8 | * | ||
9 | * FFmpeg is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of the GNU General Public License as published by | ||
11 | * the Free Software Foundation; either version 2 of the License, or | ||
12 | * (at your option) any later version. | ||
13 | * | ||
14 | * FFmpeg is distributed in the hope that it will be useful, | ||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
17 | * GNU General Public License for more details. | ||
18 | * | ||
19 | * You should have received a copy of the GNU General Public License | ||
20 | * along with FFmpeg; if not, write to the Free Software | ||
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
22 | */ | ||
23 | |||
24 | #define vzero vec_splat_s32(0) | ||
25 | |||
/*
 * Pack dstW 32-bit fixed-point accumulators (19 fractional bits) from val[]
 * into 8-bit pixels in dest[], saturating to [0,255].  AltiVec is used for
 * 16-pixel groups; scalar code handles alignment prologue and the tail.
 * val is assumed 16-byte aligned (callers allocate it aligned(16)).
 */
static inline void
altivec_packIntArrayToCharArray(int *val, uint8_t* dest, int dstW) {
    register int i;
    /* Shift count 19, built as 10+9 because vec_splat_u32 immediates are
       limited to -16..15. */
    vector unsigned int altivec_vectorShiftInt19 =
        vec_add(vec_splat_u32(10), vec_splat_u32(9));
    if ((unsigned long)dest % 16) {
        /* badly aligned store, we force store alignment */
        /* and will handle load misalignment on val w/ vec_perm */
        vector unsigned char perm1;
        vector signed int v1;
        /* Scalar loop until dest+i reaches a 16-byte boundary. */
        for (i = 0 ; (i < dstW) &&
            (((unsigned long)dest + i) % 16) ; i++) {
            int t = val[i] >> 19;
            dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t);
        }
        /* i<<2 converts int index to byte offset for the vector loads. */
        perm1 = vec_lvsl(i << 2, val);
        v1 = vec_ld(i << 2, val);
        /* 16 output bytes per iteration: four int vectors, shifted, then
           packed 32->16->8 with unsigned saturation. */
        for ( ; i < (dstW - 15); i+=16) {
            int offset = i << 2;
            vector signed int v2 = vec_ld(offset + 16, val);
            vector signed int v3 = vec_ld(offset + 32, val);
            vector signed int v4 = vec_ld(offset + 48, val);
            vector signed int v5 = vec_ld(offset + 64, val);
            /* Realign the possibly-misaligned val stream. */
            vector signed int v12 = vec_perm(v1, v2, perm1);
            vector signed int v23 = vec_perm(v2, v3, perm1);
            vector signed int v34 = vec_perm(v3, v4, perm1);
            vector signed int v45 = vec_perm(v4, v5, perm1);

            vector signed int vA = vec_sra(v12, altivec_vectorShiftInt19);
            vector signed int vB = vec_sra(v23, altivec_vectorShiftInt19);
            vector signed int vC = vec_sra(v34, altivec_vectorShiftInt19);
            vector signed int vD = vec_sra(v45, altivec_vectorShiftInt19);
            vector unsigned short vs1 = vec_packsu(vA, vB);
            vector unsigned short vs2 = vec_packsu(vC, vD);
            vector unsigned char vf = vec_packsu(vs1, vs2);
            vec_st(vf, i, dest);
            v1 = v5; /* carry the last loaded vector into the next iteration */
        }
    } else { // dest is properly aligned, great
        for (i = 0; i < (dstW - 15); i+=16) {
            int offset = i << 2;
            vector signed int v1 = vec_ld(offset, val);
            vector signed int v2 = vec_ld(offset + 16, val);
            vector signed int v3 = vec_ld(offset + 32, val);
            vector signed int v4 = vec_ld(offset + 48, val);
            vector signed int v5 = vec_sra(v1, altivec_vectorShiftInt19);
            vector signed int v6 = vec_sra(v2, altivec_vectorShiftInt19);
            vector signed int v7 = vec_sra(v3, altivec_vectorShiftInt19);
            vector signed int v8 = vec_sra(v4, altivec_vectorShiftInt19);
            vector unsigned short vs1 = vec_packsu(v5, v6);
            vector unsigned short vs2 = vec_packsu(v7, v8);
            vector unsigned char vf = vec_packsu(vs1, vs2);
            vec_st(vf, i, dest);
        }
    }
    /* Scalar tail for the remaining (dstW % 16) pixels. */
    for ( ; i < dstW ; i++) {
        int t = val[i] >> 19;
        dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t);
    }
}
86 | |||
/*
 * AltiVec vertical scaler: for each output pixel, accumulate
 * sum_j(src[j][i] * filter[j]) into 32-bit fixed point (rounding bias 1<<18),
 * then pack down to 8 bits via altivec_packIntArrayToCharArray().
 * Luma is always produced; chroma (uDest/vDest) only when uDest != NULL.
 * NOTE(review): V chroma samples are read at chrSrc[j][i + 2048], i.e. the V
 * plane lives 2048 int16s after U in the shared chroma buffer — confirm this
 * offset against the buffer layout in swscale.c.
 */
static inline void
yuv2yuvX_altivec_real(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                      int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                      uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
{
    /* Rounding bias: 1<<18 is half of the final >>19 shift. */
    const vector signed int vini = {(1 << 18), (1 << 18), (1 << 18), (1 << 18)};
    register int i, j;
    { /* ---- luma plane ---- */
        int __attribute__ ((aligned (16))) val[dstW];

        /* Initialize accumulators with the rounding bias (vector then tail). */
        for (i = 0; i < (dstW -7); i+=4) {
            vec_st(vini, i << 2, val);
        }
        for (; i < dstW; i++) {
            val[i] = (1 << 18);
        }

        for (j = 0; j < lumFilterSize; j++) {
            /* j<<1 = byte offset of lumFilter[j]; realign + splat the
               coefficient across all 8 lanes. */
            vector signed short l1, vLumFilter = vec_ld(j << 1, lumFilter);
            vector unsigned char perm, perm0 = vec_lvsl(j << 1, lumFilter);
            vLumFilter = vec_perm(vLumFilter, vLumFilter, perm0);
            vLumFilter = vec_splat(vLumFilter, 0); // lumFilter[j] is loaded 8 times in vLumFilter

            perm = vec_lvsl(0, lumSrc[j]);
            l1 = vec_ld(0, lumSrc[j]);

            /* 8 pixels per iteration: 16-bit src * 16-bit coeff -> 32-bit,
               accumulated into val[]. */
            for (i = 0; i < (dstW - 7); i+=8) {
                int offset = i << 2;
                vector signed short l2 = vec_ld((i << 1) + 16, lumSrc[j]);

                vector signed int v1 = vec_ld(offset, val);
                vector signed int v2 = vec_ld(offset + 16, val);

                vector signed short ls = vec_perm(l1, l2, perm); // lumSrc[j][i] ... lumSrc[j][i+7]

                /* mule/mulo + mergeh/mergel restores the original element
                   order of the 32-bit products. */
                vector signed int i1 = vec_mule(vLumFilter, ls);
                vector signed int i2 = vec_mulo(vLumFilter, ls);

                vector signed int vf1 = vec_mergeh(i1, i2);
                vector signed int vf2 = vec_mergel(i1, i2); // lumSrc[j][i] * lumFilter[j] ... lumSrc[j][i+7] * lumFilter[j]

                vector signed int vo1 = vec_add(v1, vf1);
                vector signed int vo2 = vec_add(v2, vf2);

                vec_st(vo1, offset, val);
                vec_st(vo2, offset + 16, val);

                l1 = l2;
            }
            /* Scalar tail. */
            for ( ; i < dstW; i++) {
                val[i] += lumSrc[j][i] * lumFilter[j];
            }
        }
        altivec_packIntArrayToCharArray(val, dest, dstW);
    }
    if (uDest != 0) { /* ---- chroma planes (U and V processed together) ---- */
        int __attribute__ ((aligned (16))) u[chrDstW];
        int __attribute__ ((aligned (16))) v[chrDstW];

        for (i = 0; i < (chrDstW -7); i+=4) {
            vec_st(vini, i << 2, u);
            vec_st(vini, i << 2, v);
        }
        for (; i < chrDstW; i++) {
            u[i] = (1 << 18);
            v[i] = (1 << 18);
        }

        for (j = 0; j < chrFilterSize; j++) {
            vector signed short l1, l1_V, vChrFilter = vec_ld(j << 1, chrFilter);
            vector unsigned char perm, perm0 = vec_lvsl(j << 1, chrFilter);
            vChrFilter = vec_perm(vChrFilter, vChrFilter, perm0);
            vChrFilter = vec_splat(vChrFilter, 0); // chrFilter[j] is loaded 8 times in vChrFilter

            perm = vec_lvsl(0, chrSrc[j]);
            l1 = vec_ld(0, chrSrc[j]);
            /* V samples start 2048 int16s (2048<<1 bytes) into the plane. */
            l1_V = vec_ld(2048 << 1, chrSrc[j]);

            for (i = 0; i < (chrDstW - 7); i+=8) {
                int offset = i << 2;
                vector signed short l2 = vec_ld((i << 1) + 16, chrSrc[j]);
                vector signed short l2_V = vec_ld(((i + 2048) << 1) + 16, chrSrc[j]);

                vector signed int v1 = vec_ld(offset, u);
                vector signed int v2 = vec_ld(offset + 16, u);
                vector signed int v1_V = vec_ld(offset, v);
                vector signed int v2_V = vec_ld(offset + 16, v);

                vector signed short ls = vec_perm(l1, l2, perm); // chrSrc[j][i] ... chrSrc[j][i+7]
                vector signed short ls_V = vec_perm(l1_V, l2_V, perm); // chrSrc[j][i+2048] ... chrSrc[j][i+2055]

                vector signed int i1 = vec_mule(vChrFilter, ls);
                vector signed int i2 = vec_mulo(vChrFilter, ls);
                vector signed int i1_V = vec_mule(vChrFilter, ls_V);
                vector signed int i2_V = vec_mulo(vChrFilter, ls_V);

                vector signed int vf1 = vec_mergeh(i1, i2);
                vector signed int vf2 = vec_mergel(i1, i2); // chrSrc[j][i] * chrFilter[j] ... chrSrc[j][i+7] * chrFilter[j]
                vector signed int vf1_V = vec_mergeh(i1_V, i2_V);
                vector signed int vf2_V = vec_mergel(i1_V, i2_V); // chrSrc[j][i] * chrFilter[j] ... chrSrc[j][i+7] * chrFilter[j]

                vector signed int vo1 = vec_add(v1, vf1);
                vector signed int vo2 = vec_add(v2, vf2);
                vector signed int vo1_V = vec_add(v1_V, vf1_V);
                vector signed int vo2_V = vec_add(v2_V, vf2_V);

                vec_st(vo1, offset, u);
                vec_st(vo2, offset + 16, u);
                vec_st(vo1_V, offset, v);
                vec_st(vo2_V, offset + 16, v);

                l1 = l2;
                l1_V = l2_V;
            }
            for ( ; i < chrDstW; i++) {
                u[i] += chrSrc[j][i] * chrFilter[j];
                v[i] += chrSrc[j][i + 2048] * chrFilter[j];
            }
        }
        altivec_packIntArrayToCharArray(u, uDest, chrDstW);
        altivec_packIntArrayToCharArray(v, vDest, chrDstW);
    }
}
210 | |||
/*
 * AltiVec horizontal scaler: dst[i] = clip(sum_j(src[filterPos[i]+j] *
 * filter[i*filterSize+j]) >> 7, 0, 32767).  Specialized vector paths exist
 * for filterSize 4, 8, 16 and multiples of 8/16; anything not a multiple of
 * 4 falls back to scalar code.
 * NOTE(review): the "(int)src" pointer casts below truncate on LP64 targets;
 * presumably only the low 4 bits matter for the %16 alignment test, but
 * confirm — this code dates from 32-bit PPC.
 */
static inline void hScale_altivec_real(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, int16_t *filter, int16_t *filterPos, int filterSize) {
    register int i;
    /* Scratch slot for extracting the vec_sums result (sum lands in lane 3). */
    int __attribute__ ((aligned (16))) tempo[4];

    if (filterSize % 4) {
        /* Scalar fallback for filter sizes the vector code cannot handle. */
        for (i=0; i<dstW; i++) {
            register int j;
            register int srcPos = filterPos[i];
            register int val = 0;
            for (j=0; j<filterSize; j++) {
                val += ((int)src[srcPos + j])*filter[filterSize*i + j];
            }
            dst[i] = av_clip(val>>7, 0, (1<<15)-1);
        }
    }
    else
    switch (filterSize) {
    case 4:
    {
        for (i=0; i<dstW; i++) {
            register int srcPos = filterPos[i];

            vector unsigned char src_v0 = vec_ld(srcPos, src);
            vector unsigned char src_v1, src_vF;
            vector signed short src_v, filter_v;
            vector signed int val_vEven, val_s;
            /* Second load only needed when the 4 source bytes straddle a
               16-byte boundary.  NOTE(review): when the branch is not taken
               src_v1 stays uninitialized; vec_perm then only selects bytes
               from src_v0, so the garbage is never used — fragile but
               intentional. */
            if ((((int)src + srcPos)% 16) > 12) {
                src_v1 = vec_ld(srcPos + 16, src);
            }
            src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));

            src_v = // vec_unpackh sign-extends...
                (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
            // now put our elements in the even slots
            src_v = vec_mergeh(src_v, (vector signed short)vzero);

            filter_v = vec_ld(i << 3, filter);
            // The 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2).

            // The neat trick: We only care for half the elements,
            // high or low depending on (i<<3)%16 (it's 0 or 8 here),
            // and we're going to use vec_mule, so we choose
            // carefully how to "unpack" the elements into the even slots.
            if ((i << 3) % 16)
                filter_v = vec_mergel(filter_v, (vector signed short)vzero);
            else
                filter_v = vec_mergeh(filter_v, (vector signed short)vzero);

            val_vEven = vec_mule(src_v, filter_v);
            val_s = vec_sums(val_vEven, vzero);
            vec_st(val_s, 0, tempo);
            dst[i] = av_clip(tempo[3]>>7, 0, (1<<15)-1);
        }
    }
    break;

    case 8:
    {
        for (i=0; i<dstW; i++) {
            register int srcPos = filterPos[i];

            vector unsigned char src_v0 = vec_ld(srcPos, src);
            vector unsigned char src_v1, src_vF;
            vector signed short src_v, filter_v;
            vector signed int val_v, val_s;
            /* Same conditionally-initialized src_v1 pattern as case 4. */
            if ((((int)src + srcPos)% 16) > 8) {
                src_v1 = vec_ld(srcPos + 16, src);
            }
            src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));

            src_v = // vec_unpackh sign-extends...
                (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
            filter_v = vec_ld(i << 4, filter);
            // the 4 above is 3 (filterSize == 8) + 1 (sizeof(short) == 2)

            val_v = vec_msums(src_v, filter_v, (vector signed int)vzero);
            val_s = vec_sums(val_v, vzero);
            vec_st(val_s, 0, tempo);
            dst[i] = av_clip(tempo[3]>>7, 0, (1<<15)-1);
        }
    }
    break;

    case 16:
    {
        for (i=0; i<dstW; i++) {
            register int srcPos = filterPos[i];

            /* 16 taps always straddle at most two vectors; load both
               unconditionally. */
            vector unsigned char src_v0 = vec_ld(srcPos, src);
            vector unsigned char src_v1 = vec_ld(srcPos + 16, src);
            vector unsigned char src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));

            vector signed short src_vA = // vec_unpackh sign-extends...
                (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
            vector signed short src_vB = // vec_unpackh sign-extends...
                (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF));

            vector signed short filter_v0 = vec_ld(i << 5, filter);
            vector signed short filter_v1 = vec_ld((i << 5) + 16, filter);
            // the 5 above are 4 (filterSize == 16) + 1 (sizeof(short) == 2)

            vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero);
            vector signed int val_v = vec_msums(src_vB, filter_v1, val_acc);

            vector signed int val_s = vec_sums(val_v, vzero);

            vec_st(val_s, 0, tempo);
            dst[i] = av_clip(tempo[3]>>7, 0, (1<<15)-1);
        }
    }
    break;

    default:
    {
        /* Generic path: 16 taps per inner iteration plus an optional
           8-tap remainder (filterSize is a multiple of 4 here). */
        for (i=0; i<dstW; i++) {
            register int j;
            register int srcPos = filterPos[i];

            vector signed int val_s, val_v = (vector signed int)vzero;
            vector signed short filter_v0R = vec_ld(i * 2 * filterSize, filter);
            vector unsigned char permF = vec_lvsl((i * 2 * filterSize), filter);

            vector unsigned char src_v0 = vec_ld(srcPos, src);
            vector unsigned char permS = vec_lvsl(srcPos, src);

            for (j = 0 ; j < filterSize - 15; j += 16) {
                vector unsigned char src_v1 = vec_ld(srcPos + j + 16, src);
                vector unsigned char src_vF = vec_perm(src_v0, src_v1, permS);

                vector signed short src_vA = // vec_unpackh sign-extends...
                    (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
                vector signed short src_vB = // vec_unpackh sign-extends...
                    (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF));

                vector signed short filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter);
                vector signed short filter_v2R = vec_ld((i * 2 * filterSize) + (j * 2) + 32, filter);
                vector signed short filter_v0 = vec_perm(filter_v0R, filter_v1R, permF);
                vector signed short filter_v1 = vec_perm(filter_v1R, filter_v2R, permF);

                vector signed int val_acc = vec_msums(src_vA, filter_v0, val_v);
                val_v = vec_msums(src_vB, filter_v1, val_acc);

                filter_v0R = filter_v2R;
                src_v0 = src_v1;
            }

            if (j < filterSize-7) {
                // loading src_v0 is useless, it's already done above
                //vector unsigned char src_v0 = vec_ld(srcPos + j, src);
                vector unsigned char src_v1, src_vF;
                vector signed short src_v, filter_v1R, filter_v;
                /* Same conditionally-initialized src_v1 pattern as above. */
                if ((((int)src + srcPos)% 16) > 8) {
                    src_v1 = vec_ld(srcPos + j + 16, src);
                }
                src_vF = vec_perm(src_v0, src_v1, permS);

                src_v = // vec_unpackh sign-extends...
                    (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
                // loading filter_v0R is useless, it's already done above
                //vector signed short filter_v0R = vec_ld((i * 2 * filterSize) + j, filter);
                filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter);
                filter_v = vec_perm(filter_v0R, filter_v1R, permF);

                val_v = vec_msums(src_v, filter_v, val_v);
            }

            val_s = vec_sums(val_v, vzero);

            vec_st(val_s, 0, tempo);
            dst[i] = av_clip(tempo[3]>>7, 0, (1<<15)-1);
        }

    }
    }
}
386 | |||
/*
 * Unscaled planar YV12 -> packed YUY2 (Y U Y V) conversion for one slice.
 * Uses AltiVec when width is a multiple of 16, otherwise falls back to the
 * scalar yv12toyuy2().  Returns the number of rows processed (srcSliceH).
 */
static inline int yv12toyuy2_unscaled_altivec(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                              int srcSliceH, uint8_t* dstParam[], int dstStride_a[]) {
    uint8_t *dst=dstParam[0] + dstStride_a[0]*srcSliceY;
    // yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0], srcStride[1], dstStride[0]);
    uint8_t *ysrc = src[0];
    uint8_t *usrc = src[1];
    uint8_t *vsrc = src[2];
    const int width = c->srcW;
    const int height = srcSliceH;
    const int lumStride = srcStride[0];
    const int chromStride = srcStride[1];
    const int dstStride = dstStride_a[0];
    /* Realignment pattern for the luma stream (computed once per slice). */
    const vector unsigned char yperm = vec_lvsl(0, ysrc);
    /* 4:2:0 — one chroma row per two luma rows. */
    const int vertLumPerChroma = 2;
    register unsigned int y;

    /* Width not a multiple of 16: use the scalar converter. */
    if (width&15) {
        yv12toyuy2(ysrc, usrc, vsrc, dst, c->srcW, srcSliceH, lumStride, chromStride, dstStride);
        return srcSliceH;
    }

    /* This code assumes:

    1) dst is 16 bytes-aligned
    2) dstStride is a multiple of 16
    3) width is a multiple of 16
    4) lum & chrom stride are multiples of 8
    */

    for (y=0; y<height; y++) {
        int i;
        /* Main loop: 32 luma + 16 U + 16 V pixels -> 64 output bytes. */
        for (i = 0; i < width - 31; i+= 32) {
            const unsigned int j = i >> 1;
            vector unsigned char v_yA = vec_ld(i, ysrc);
            vector unsigned char v_yB = vec_ld(i + 16, ysrc);
            vector unsigned char v_yC = vec_ld(i + 32, ysrc);
            vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
            vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm);
            vector unsigned char v_uA = vec_ld(j, usrc);
            vector unsigned char v_uB = vec_ld(j + 16, usrc);
            vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
            vector unsigned char v_vA = vec_ld(j, vsrc);
            vector unsigned char v_vB = vec_ld(j + 16, vsrc);
            vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
            /* Interleave U/V, then Y with UV, producing Y U Y V byte order. */
            vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
            vector unsigned char v_uv_b = vec_mergel(v_u, v_v);
            vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a);
            vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a);
            vector unsigned char v_yuy2_2 = vec_mergeh(v_y2, v_uv_b);
            vector unsigned char v_yuy2_3 = vec_mergel(v_y2, v_uv_b);
            vec_st(v_yuy2_0, (i << 1), dst);
            vec_st(v_yuy2_1, (i << 1) + 16, dst);
            vec_st(v_yuy2_2, (i << 1) + 32, dst);
            vec_st(v_yuy2_3, (i << 1) + 48, dst);
        }
        /* Tail: exactly 16 remaining pixels (width is a multiple of 16);
           loads here rely on 16-byte-aligned row starts. */
        if (i < width) {
            const unsigned int j = i >> 1;
            vector unsigned char v_y1 = vec_ld(i, ysrc);
            vector unsigned char v_u = vec_ld(j, usrc);
            vector unsigned char v_v = vec_ld(j, vsrc);
            vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
            vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a);
            vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a);
            vec_st(v_yuy2_0, (i << 1), dst);
            vec_st(v_yuy2_1, (i << 1) + 16, dst);
        }
        /* Advance chroma rows only every vertLumPerChroma luma rows. */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }

    return srcSliceH;
}
463 | |||
/*
 * Unscaled planar YV12 -> packed UYVY (U Y V Y) conversion for one slice.
 * Identical structure to yv12toyuy2_unscaled_altivec(), but merges UV first
 * so chroma bytes precede luma.  Falls back to scalar yv12touyvy() when
 * width is not a multiple of 16.  Returns srcSliceH.
 */
static inline int yv12touyvy_unscaled_altivec(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                              int srcSliceH, uint8_t* dstParam[], int dstStride_a[]) {
    uint8_t *dst=dstParam[0] + dstStride_a[0]*srcSliceY;
    // yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0], srcStride[1], dstStride[0]);
    uint8_t *ysrc = src[0];
    uint8_t *usrc = src[1];
    uint8_t *vsrc = src[2];
    const int width = c->srcW;
    const int height = srcSliceH;
    const int lumStride = srcStride[0];
    const int chromStride = srcStride[1];
    const int dstStride = dstStride_a[0];
    /* 4:2:0 — one chroma row per two luma rows. */
    const int vertLumPerChroma = 2;
    const vector unsigned char yperm = vec_lvsl(0, ysrc);
    register unsigned int y;

    if (width&15) {
        yv12touyvy(ysrc, usrc, vsrc, dst, c->srcW, srcSliceH, lumStride, chromStride, dstStride);
        return srcSliceH;
    }

    /* This code assumes:

    1) dst is 16 bytes-aligned
    2) dstStride is a multiple of 16
    3) width is a multiple of 16
    4) lum & chrom stride are multiples of 8
    */

    for (y=0; y<height; y++) {
        int i;
        /* Main loop: 32 luma + 16 U + 16 V pixels -> 64 output bytes. */
        for (i = 0; i < width - 31; i+= 32) {
            const unsigned int j = i >> 1;
            vector unsigned char v_yA = vec_ld(i, ysrc);
            vector unsigned char v_yB = vec_ld(i + 16, ysrc);
            vector unsigned char v_yC = vec_ld(i + 32, ysrc);
            vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
            vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm);
            vector unsigned char v_uA = vec_ld(j, usrc);
            vector unsigned char v_uB = vec_ld(j + 16, usrc);
            vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
            vector unsigned char v_vA = vec_ld(j, vsrc);
            vector unsigned char v_vB = vec_ld(j + 16, vsrc);
            vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
            vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
            vector unsigned char v_uv_b = vec_mergel(v_u, v_v);
            /* UV merged first: output byte order is U Y V Y. */
            vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1);
            vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1);
            vector unsigned char v_uyvy_2 = vec_mergeh(v_uv_b, v_y2);
            vector unsigned char v_uyvy_3 = vec_mergel(v_uv_b, v_y2);
            vec_st(v_uyvy_0, (i << 1), dst);
            vec_st(v_uyvy_1, (i << 1) + 16, dst);
            vec_st(v_uyvy_2, (i << 1) + 32, dst);
            vec_st(v_uyvy_3, (i << 1) + 48, dst);
        }
        /* Tail: exactly 16 remaining pixels (width is a multiple of 16). */
        if (i < width) {
            const unsigned int j = i >> 1;
            vector unsigned char v_y1 = vec_ld(i, ysrc);
            vector unsigned char v_u = vec_ld(j, usrc);
            vector unsigned char v_v = vec_ld(j, vsrc);
            vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
            vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1);
            vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1);
            vec_st(v_uyvy_0, (i << 1), dst);
            vec_st(v_uyvy_1, (i << 1) + 16, dst);
        }
        /* Advance chroma rows only every vertLumPerChroma luma rows. */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
    return srcSliceH;
}
diff --git a/src/plugins/ffmpeg/libswscale/swscale_avoption.c b/src/plugins/ffmpeg/libswscale/swscale_avoption.c deleted file mode 100644 index 1878b4e..0000000 --- a/src/plugins/ffmpeg/libswscale/swscale_avoption.c +++ /dev/null | |||
@@ -1,59 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> | ||
3 | * | ||
4 | * This file is part of FFmpeg. | ||
5 | * | ||
6 | * FFmpeg is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU Lesser General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2.1 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * FFmpeg is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * Lesser General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU Lesser General Public | ||
17 | * License along with FFmpeg; if not, write to the Free Software | ||
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
19 | */ | ||
20 | |||
21 | #include "libavutil/avutil.h" | ||
22 | #include "libavcodec/opt.h" | ||
23 | #include "swscale.h" | ||
24 | #include "swscale_internal.h" | ||
25 | |||
/** av_log item-name callback for SwsContext: every instance reports the
 *  same fixed name. */
static const char * sws_context_to_name(void * ptr) {
    (void)ptr; /* context-independent name */
    return "swscaler";
}
29 | |||
/* Byte offset of a member inside SwsContext, for AVOption field binding. */
#define OFFSET(x) offsetof(SwsContext, x)
#define DEFAULT 0
/* All options are video parameters set when configuring the scaler. */
#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM

/* AVOption table: one FLAGS option ("sws_flags") plus named constants for
   each SWS_* flag bit in the "sws_flags" unit.  Terminated by {NULL}. */
static const AVOption options[] = {
    { "sws_flags", "scaler/cpu flags", OFFSET(flags), FF_OPT_TYPE_FLAGS, DEFAULT, 0, UINT_MAX, VE, "sws_flags" },
    { "fast_bilinear", "fast bilinear", 0, FF_OPT_TYPE_CONST, SWS_FAST_BILINEAR, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "bilinear", "bilinear", 0, FF_OPT_TYPE_CONST, SWS_BILINEAR, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "bicubic", "bicubic", 0, FF_OPT_TYPE_CONST, SWS_BICUBIC, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "experimental", "experimental", 0, FF_OPT_TYPE_CONST, SWS_X, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "neighbor", "nearest neighbor", 0, FF_OPT_TYPE_CONST, SWS_POINT, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "area", "averaging area", 0, FF_OPT_TYPE_CONST, SWS_AREA, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "bicublin", "luma bicubic, chroma bilinear", 0, FF_OPT_TYPE_CONST, SWS_BICUBLIN, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "gauss", "gaussian", 0, FF_OPT_TYPE_CONST, SWS_GAUSS, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "sinc", "sinc", 0, FF_OPT_TYPE_CONST, SWS_SINC, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "lanczos", "lanczos", 0, FF_OPT_TYPE_CONST, SWS_LANCZOS, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "spline", "natural bicubic spline", 0, FF_OPT_TYPE_CONST, SWS_SPLINE, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "print_info", "print info", 0, FF_OPT_TYPE_CONST, SWS_PRINT_INFO, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "accurate_rnd", "accurate rounding", 0, FF_OPT_TYPE_CONST, SWS_ACCURATE_RND, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "mmx", "MMX SIMD acceleration", 0, FF_OPT_TYPE_CONST, SWS_CPU_CAPS_MMX, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "mmx2", "MMX2 SIMD acceleration", 0, FF_OPT_TYPE_CONST, SWS_CPU_CAPS_MMX2, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "3dnow", "3DNOW SIMD acceleration", 0, FF_OPT_TYPE_CONST, SWS_CPU_CAPS_3DNOW, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "altivec", "AltiVec SIMD acceleration", 0, FF_OPT_TYPE_CONST, SWS_CPU_CAPS_ALTIVEC, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "bfin", "Blackfin SIMD acceleration", 0, FF_OPT_TYPE_CONST, SWS_CPU_CAPS_BFIN, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "full_chroma_int", "full chroma interpolation", 0 , FF_OPT_TYPE_CONST, SWS_FULL_CHR_H_INT, INT_MIN, INT_MAX, VE, "sws_flags" },
    { "full_chroma_inp", "full chroma input", 0 , FF_OPT_TYPE_CONST, SWS_FULL_CHR_H_INP, INT_MIN, INT_MAX, VE, "sws_flags" },
    { NULL }
};

/* AVClass attached to SwsContext.av_class so av_log/av_opt can introspect. */
const AVClass sws_context_class = { "SWScaler", sws_context_to_name, options };
diff --git a/src/plugins/ffmpeg/libswscale/swscale_bfin.c b/src/plugins/ffmpeg/libswscale/swscale_bfin.c deleted file mode 100644 index 3e63bbd..0000000 --- a/src/plugins/ffmpeg/libswscale/swscale_bfin.c +++ /dev/null | |||
@@ -1,94 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com> | ||
3 | * | ||
4 | * Blackfin software video scaler operations | ||
5 | * | ||
6 | * This file is part of FFmpeg. | ||
7 | * | ||
8 | * FFmpeg is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU Lesser General Public | ||
10 | * License as published by the Free Software Foundation; either | ||
11 | * version 2.1 of the License, or (at your option) any later version. | ||
12 | * | ||
13 | * FFmpeg is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
16 | * Lesser General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU Lesser General Public | ||
19 | * License along with FFmpeg; if not, write to the Free Software | ||
20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
21 | */ | ||
22 | |||
23 | #include <stdio.h> | ||
24 | #include <stdlib.h> | ||
25 | #include <string.h> | ||
26 | #include <inttypes.h> | ||
27 | #include <assert.h> | ||
28 | #include "config.h" | ||
29 | #ifdef HAVE_MALLOC_H | ||
30 | #include <malloc.h> | ||
31 | #endif | ||
32 | #include <unistd.h> | ||
33 | #include "rgb2rgb.h" | ||
34 | #include "swscale.h" | ||
35 | #include "swscale_internal.h" | ||
36 | |||
37 | #ifdef __FDPIC__ | ||
38 | #define L1CODE __attribute__ ((l1_text)) | ||
39 | #else | ||
40 | #define L1CODE | ||
41 | #endif | ||
42 | |||
43 | extern int ff_bfin_uyvytoyv12 (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | ||
44 | long width, long height, | ||
45 | long lumStride, long chromStride, long srcStride) L1CODE; | ||
46 | |||
47 | extern int ff_bfin_yuyvtoyv12 (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, | ||
48 | long width, long height, | ||
49 | long lumStride, long chromStride, long srcStride) L1CODE; | ||
50 | |||
51 | static int uyvytoyv12_unscaled (SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | ||
52 | int srcSliceH, uint8_t* dst[], int dstStride[]) | ||
53 | { | ||
54 | uint8_t *dsty = dst[0] + dstStride[0]*srcSliceY; | ||
55 | uint8_t *dstu = dst[1] + dstStride[1]*srcSliceY/2; | ||
56 | uint8_t *dstv = dst[2] + dstStride[2]*srcSliceY/2; | ||
57 | uint8_t *ip = src[0] + srcStride[0]*srcSliceY; | ||
58 | int w = dstStride[0]; | ||
59 | |||
60 | ff_bfin_uyvytoyv12 (ip, dsty, dstu, dstv, w, srcSliceH, dstStride[0], dstStride[1], srcStride[0]); | ||
61 | |||
62 | return srcSliceH; | ||
63 | } | ||
64 | |||
65 | static int yuyvtoyv12_unscaled (SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | ||
66 | int srcSliceH, uint8_t* dst[], int dstStride[]) | ||
67 | { | ||
68 | uint8_t *dsty = dst[0] + dstStride[0]*srcSliceY; | ||
69 | uint8_t *dstu = dst[1] + dstStride[1]*srcSliceY/2; | ||
70 | uint8_t *dstv = dst[2] + dstStride[2]*srcSliceY/2; | ||
71 | uint8_t *ip = src[0] + srcStride[0]*srcSliceY; | ||
72 | int w = dstStride[0]; | ||
73 | |||
74 | ff_bfin_yuyvtoyv12 (ip, dsty, dstu, dstv, w, srcSliceH, dstStride[0], dstStride[1], srcStride[0]); | ||
75 | |||
76 | return srcSliceH; | ||
77 | } | ||
78 | |||
79 | |||
80 | void ff_bfin_get_unscaled_swscale (SwsContext *c) | ||
81 | { | ||
82 | SwsFunc swScale = c->swScale; | ||
83 | if (c->flags & SWS_CPU_CAPS_BFIN) | ||
84 | if (c->dstFormat == PIX_FMT_YUV420P) | ||
85 | if (c->srcFormat == PIX_FMT_UYVY422) { | ||
86 | av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized uyvytoyv12_unscaled\n"); | ||
87 | c->swScale = uyvytoyv12_unscaled; | ||
88 | } | ||
89 | if (c->dstFormat == PIX_FMT_YUV420P) | ||
90 | if (c->srcFormat == PIX_FMT_YUYV422) { | ||
91 | av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized yuyvtoyv12_unscaled\n"); | ||
92 | c->swScale = yuyvtoyv12_unscaled; | ||
93 | } | ||
94 | } | ||
diff --git a/src/plugins/ffmpeg/libswscale/swscale_internal.h b/src/plugins/ffmpeg/libswscale/swscale_internal.h deleted file mode 100644 index 14c3a04..0000000 --- a/src/plugins/ffmpeg/libswscale/swscale_internal.h +++ /dev/null | |||
@@ -1,283 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> | ||
3 | * | ||
4 | * This file is part of FFmpeg. | ||
5 | * | ||
6 | * FFmpeg is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU Lesser General Public | ||
8 | * License as published by the Free Software Foundation; either | ||
9 | * version 2.1 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * FFmpeg is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
14 | * Lesser General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU Lesser General Public | ||
17 | * License along with FFmpeg; if not, write to the Free Software | ||
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
19 | */ | ||
20 | |||
21 | #ifndef FFMPEG_SWSCALE_INTERNAL_H | ||
22 | #define FFMPEG_SWSCALE_INTERNAL_H | ||
23 | |||
24 | #include "config.h" | ||
25 | |||
26 | #ifdef HAVE_ALTIVEC_H | ||
27 | #include <altivec.h> | ||
28 | #endif | ||
29 | |||
30 | #include "libavutil/avutil.h" | ||
31 | |||
32 | #define MAX_FILTER_SIZE 256 | ||
33 | |||
34 | #define VOFW 8192 | ||
35 | #define VOF (VOFW*2) | ||
36 | |||
37 | typedef int (*SwsFunc)(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY, | ||
38 | int srcSliceH, uint8_t* dst[], int dstStride[]); | ||
39 | |||
40 | /* This struct should be aligned on at least a 32-byte boundary. */ | ||
41 | typedef struct SwsContext{ | ||
42 | /** | ||
43 | * info on struct for av_log | ||
44 | */ | ||
45 | const AVClass *av_class; | ||
46 | |||
47 | /** | ||
48 | * Note that src, dst, srcStride, dstStride will be copied in the | ||
49 | * sws_scale() wrapper so they can be freely modified here. | ||
50 | */ | ||
51 | SwsFunc swScale; | ||
52 | int srcW, srcH, dstH; | ||
53 | int chrSrcW, chrSrcH, chrDstW, chrDstH; | ||
54 | int lumXInc, chrXInc; | ||
55 | int lumYInc, chrYInc; | ||
56 | int dstFormat, srcFormat; ///< format 4:2:0 type is always YV12 | ||
57 | int origDstFormat, origSrcFormat; ///< format | ||
58 | int chrSrcHSubSample, chrSrcVSubSample; | ||
59 | int chrIntHSubSample, chrIntVSubSample; | ||
60 | int chrDstHSubSample, chrDstVSubSample; | ||
61 | int vChrDrop; | ||
62 | int sliceDir; | ||
63 | double param[2]; | ||
64 | |||
65 | int16_t **lumPixBuf; | ||
66 | int16_t **chrPixBuf; | ||
67 | int16_t *hLumFilter; | ||
68 | int16_t *hLumFilterPos; | ||
69 | int16_t *hChrFilter; | ||
70 | int16_t *hChrFilterPos; | ||
71 | int16_t *vLumFilter; | ||
72 | int16_t *vLumFilterPos; | ||
73 | int16_t *vChrFilter; | ||
74 | int16_t *vChrFilterPos; | ||
75 | |||
76 | uint8_t formatConvBuffer[VOF]; //FIXME dynamic allocation, but we have to change a lot of code for this to be useful | ||
77 | |||
78 | int hLumFilterSize; | ||
79 | int hChrFilterSize; | ||
80 | int vLumFilterSize; | ||
81 | int vChrFilterSize; | ||
82 | int vLumBufSize; | ||
83 | int vChrBufSize; | ||
84 | |||
85 | uint8_t *funnyYCode; | ||
86 | uint8_t *funnyUVCode; | ||
87 | int32_t *lumMmx2FilterPos; | ||
88 | int32_t *chrMmx2FilterPos; | ||
89 | int16_t *lumMmx2Filter; | ||
90 | int16_t *chrMmx2Filter; | ||
91 | |||
92 | int canMMX2BeUsed; | ||
93 | |||
94 | int lastInLumBuf; | ||
95 | int lastInChrBuf; | ||
96 | int lumBufIndex; | ||
97 | int chrBufIndex; | ||
98 | int dstY; | ||
99 | int flags; | ||
100 | void * yuvTable; // pointer to the yuv->rgb table start so it can be freed() | ||
101 | uint8_t * table_rV[256]; | ||
102 | uint8_t * table_gU[256]; | ||
103 | int table_gV[256]; | ||
104 | uint8_t * table_bU[256]; | ||
105 | |||
106 | //Colorspace stuff | ||
107 | int contrast, brightness, saturation; // for sws_getColorspaceDetails | ||
108 | int srcColorspaceTable[4]; | ||
109 | int dstColorspaceTable[4]; | ||
110 | int srcRange, dstRange; | ||
111 | |||
112 | #define RED_DITHER "0*8" | ||
113 | #define GREEN_DITHER "1*8" | ||
114 | #define BLUE_DITHER "2*8" | ||
115 | #define Y_COEFF "3*8" | ||
116 | #define VR_COEFF "4*8" | ||
117 | #define UB_COEFF "5*8" | ||
118 | #define VG_COEFF "6*8" | ||
119 | #define UG_COEFF "7*8" | ||
120 | #define Y_OFFSET "8*8" | ||
121 | #define U_OFFSET "9*8" | ||
122 | #define V_OFFSET "10*8" | ||
123 | #define LUM_MMX_FILTER_OFFSET "11*8" | ||
124 | #define CHR_MMX_FILTER_OFFSET "11*8+4*4*256" | ||
125 | #define DSTW_OFFSET "11*8+4*4*256*2" //do not change, it is hardcoded in the ASM | ||
126 | #define ESP_OFFSET "11*8+4*4*256*2+8" | ||
127 | #define VROUNDER_OFFSET "11*8+4*4*256*2+16" | ||
128 | #define U_TEMP "11*8+4*4*256*2+24" | ||
129 | #define V_TEMP "11*8+4*4*256*2+32" | ||
130 | |||
131 | uint64_t redDither __attribute__((aligned(8))); | ||
132 | uint64_t greenDither __attribute__((aligned(8))); | ||
133 | uint64_t blueDither __attribute__((aligned(8))); | ||
134 | |||
135 | uint64_t yCoeff __attribute__((aligned(8))); | ||
136 | uint64_t vrCoeff __attribute__((aligned(8))); | ||
137 | uint64_t ubCoeff __attribute__((aligned(8))); | ||
138 | uint64_t vgCoeff __attribute__((aligned(8))); | ||
139 | uint64_t ugCoeff __attribute__((aligned(8))); | ||
140 | uint64_t yOffset __attribute__((aligned(8))); | ||
141 | uint64_t uOffset __attribute__((aligned(8))); | ||
142 | uint64_t vOffset __attribute__((aligned(8))); | ||
143 | int32_t lumMmxFilter[4*MAX_FILTER_SIZE]; | ||
144 | int32_t chrMmxFilter[4*MAX_FILTER_SIZE]; | ||
145 | int dstW; | ||
146 | uint64_t esp __attribute__((aligned(8))); | ||
147 | uint64_t vRounder __attribute__((aligned(8))); | ||
148 | uint64_t u_temp __attribute__((aligned(8))); | ||
149 | uint64_t v_temp __attribute__((aligned(8))); | ||
150 | |||
151 | #ifdef HAVE_ALTIVEC | ||
152 | |||
153 | vector signed short CY; | ||
154 | vector signed short CRV; | ||
155 | vector signed short CBU; | ||
156 | vector signed short CGU; | ||
157 | vector signed short CGV; | ||
158 | vector signed short OY; | ||
159 | vector unsigned short CSHIFT; | ||
160 | vector signed short *vYCoeffsBank, *vCCoeffsBank; | ||
161 | |||
162 | #endif | ||
163 | |||
164 | |||
165 | #ifdef ARCH_BFIN | ||
166 | uint32_t oy __attribute__((aligned(4))); | ||
167 | uint32_t oc __attribute__((aligned(4))); | ||
168 | uint32_t zero __attribute__((aligned(4))); | ||
169 | uint32_t cy __attribute__((aligned(4))); | ||
170 | uint32_t crv __attribute__((aligned(4))); | ||
171 | uint32_t rmask __attribute__((aligned(4))); | ||
172 | uint32_t cbu __attribute__((aligned(4))); | ||
173 | uint32_t bmask __attribute__((aligned(4))); | ||
174 | uint32_t cgu __attribute__((aligned(4))); | ||
175 | uint32_t cgv __attribute__((aligned(4))); | ||
176 | uint32_t gmask __attribute__((aligned(4))); | ||
177 | #endif | ||
178 | |||
179 | #ifdef HAVE_VIS | ||
180 | uint64_t sparc_coeffs[10] __attribute__((aligned(8))); | ||
181 | #endif | ||
182 | |||
183 | } SwsContext; | ||
184 | //FIXME check init (where 0) | ||
185 | |||
186 | SwsFunc yuv2rgb_get_func_ptr (SwsContext *c); | ||
187 | int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation); | ||
188 | |||
189 | void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation); | ||
190 | SwsFunc yuv2rgb_init_altivec (SwsContext *c); | ||
191 | void altivec_yuv2packedX (SwsContext *c, | ||
192 | int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, | ||
193 | int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | ||
194 | uint8_t *dest, int dstW, int dstY); | ||
195 | |||
196 | const char *sws_format_name(int format); | ||
197 | |||
198 | //FIXME replace this with something faster | ||
199 | #define isPlanarYUV(x) ( \ | ||
200 | (x)==PIX_FMT_YUV410P \ | ||
201 | || (x)==PIX_FMT_YUV420P \ | ||
202 | || (x)==PIX_FMT_YUV411P \ | ||
203 | || (x)==PIX_FMT_YUV422P \ | ||
204 | || (x)==PIX_FMT_YUV444P \ | ||
205 | || (x)==PIX_FMT_YUV440P \ | ||
206 | || (x)==PIX_FMT_NV12 \ | ||
207 | || (x)==PIX_FMT_NV21 \ | ||
208 | ) | ||
209 | #define isYUV(x) ( \ | ||
210 | (x)==PIX_FMT_UYVY422 \ | ||
211 | || (x)==PIX_FMT_YUYV422 \ | ||
212 | || isPlanarYUV(x) \ | ||
213 | ) | ||
214 | #define isGray(x) ( \ | ||
215 | (x)==PIX_FMT_GRAY8 \ | ||
216 | || (x)==PIX_FMT_GRAY16BE \ | ||
217 | || (x)==PIX_FMT_GRAY16LE \ | ||
218 | ) | ||
219 | #define isGray16(x) ( \ | ||
220 | (x)==PIX_FMT_GRAY16BE \ | ||
221 | || (x)==PIX_FMT_GRAY16LE \ | ||
222 | ) | ||
223 | #define isRGB(x) ( \ | ||
224 | (x)==PIX_FMT_BGR32 \ | ||
225 | || (x)==PIX_FMT_RGB24 \ | ||
226 | || (x)==PIX_FMT_RGB565 \ | ||
227 | || (x)==PIX_FMT_RGB555 \ | ||
228 | || (x)==PIX_FMT_RGB8 \ | ||
229 | || (x)==PIX_FMT_RGB4 \ | ||
230 | || (x)==PIX_FMT_RGB4_BYTE \ | ||
231 | || (x)==PIX_FMT_MONOBLACK \ | ||
232 | ) | ||
233 | #define isBGR(x) ( \ | ||
234 | (x)==PIX_FMT_RGB32 \ | ||
235 | || (x)==PIX_FMT_BGR24 \ | ||
236 | || (x)==PIX_FMT_BGR565 \ | ||
237 | || (x)==PIX_FMT_BGR555 \ | ||
238 | || (x)==PIX_FMT_BGR8 \ | ||
239 | || (x)==PIX_FMT_BGR4 \ | ||
240 | || (x)==PIX_FMT_BGR4_BYTE \ | ||
241 | || (x)==PIX_FMT_MONOBLACK \ | ||
242 | ) | ||
243 | |||
244 | static inline int fmt_depth(int fmt) | ||
245 | { | ||
246 | switch(fmt) { | ||
247 | case PIX_FMT_BGRA: | ||
248 | case PIX_FMT_ABGR: | ||
249 | case PIX_FMT_RGBA: | ||
250 | case PIX_FMT_ARGB: | ||
251 | return 32; | ||
252 | case PIX_FMT_BGR24: | ||
253 | case PIX_FMT_RGB24: | ||
254 | return 24; | ||
255 | case PIX_FMT_BGR565: | ||
256 | case PIX_FMT_RGB565: | ||
257 | case PIX_FMT_GRAY16BE: | ||
258 | case PIX_FMT_GRAY16LE: | ||
259 | return 16; | ||
260 | case PIX_FMT_BGR555: | ||
261 | case PIX_FMT_RGB555: | ||
262 | return 15; | ||
263 | case PIX_FMT_BGR8: | ||
264 | case PIX_FMT_RGB8: | ||
265 | return 8; | ||
266 | case PIX_FMT_BGR4: | ||
267 | case PIX_FMT_RGB4: | ||
268 | case PIX_FMT_BGR4_BYTE: | ||
269 | case PIX_FMT_RGB4_BYTE: | ||
270 | return 4; | ||
271 | case PIX_FMT_MONOBLACK: | ||
272 | return 1; | ||
273 | default: | ||
274 | return 0; | ||
275 | } | ||
276 | } | ||
277 | |||
278 | extern const DECLARE_ALIGNED(8, uint64_t, ff_dither4[2]); | ||
279 | extern const DECLARE_ALIGNED(8, uint64_t, ff_dither8[2]); | ||
280 | |||
281 | extern const AVClass sws_context_class; | ||
282 | |||
283 | #endif /* FFMPEG_SWSCALE_INTERNAL_H */ | ||
diff --git a/src/plugins/ffmpeg/libswscale/swscale_template.c b/src/plugins/ffmpeg/libswscale/swscale_template.c deleted file mode 100644 index 1280ba6..0000000 --- a/src/plugins/ffmpeg/libswscale/swscale_template.c +++ /dev/null | |||
@@ -1,3295 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at> | ||
3 | * | ||
4 | * This file is part of FFmpeg. | ||
5 | * | ||
6 | * FFmpeg is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * FFmpeg is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * along with FFmpeg; if not, write to the Free Software | ||
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
19 | * | ||
20 | * The C code (not assembly, MMX, ...) of this file can be used | ||
21 | * under the LGPL license. | ||
22 | */ | ||
23 | |||
24 | #undef REAL_MOVNTQ | ||
25 | #undef MOVNTQ | ||
26 | #undef PAVGB | ||
27 | #undef PREFETCH | ||
28 | #undef PREFETCHW | ||
29 | #undef EMMS | ||
30 | #undef SFENCE | ||
31 | |||
32 | #ifdef HAVE_3DNOW | ||
33 | /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */ | ||
34 | #define EMMS "femms" | ||
35 | #else | ||
36 | #define EMMS "emms" | ||
37 | #endif | ||
38 | |||
39 | #ifdef HAVE_3DNOW | ||
40 | #define PREFETCH "prefetch" | ||
41 | #define PREFETCHW "prefetchw" | ||
42 | #elif defined (HAVE_MMX2) | ||
43 | #define PREFETCH "prefetchnta" | ||
44 | #define PREFETCHW "prefetcht0" | ||
45 | #else | ||
46 | #define PREFETCH " # nop" | ||
47 | #define PREFETCHW " # nop" | ||
48 | #endif | ||
49 | |||
50 | #ifdef HAVE_MMX2 | ||
51 | #define SFENCE "sfence" | ||
52 | #else | ||
53 | #define SFENCE " # nop" | ||
54 | #endif | ||
55 | |||
56 | #ifdef HAVE_MMX2 | ||
57 | #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" | ||
58 | #elif defined (HAVE_3DNOW) | ||
59 | #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" | ||
60 | #endif | ||
61 | |||
62 | #ifdef HAVE_MMX2 | ||
63 | #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" | ||
64 | #else | ||
65 | #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t" | ||
66 | #endif | ||
67 | #define MOVNTQ(a,b) REAL_MOVNTQ(a,b) | ||
68 | |||
69 | #ifdef HAVE_ALTIVEC | ||
70 | #include "swscale_altivec_template.c" | ||
71 | #endif | ||
72 | |||
73 | #define YSCALEYUV2YV12X(x, offset, dest, width) \ | ||
74 | asm volatile(\ | ||
75 | "xor %%"REG_a", %%"REG_a" \n\t"\ | ||
76 | "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ | ||
77 | "movq %%mm3, %%mm4 \n\t"\ | ||
78 | "lea " offset "(%0), %%"REG_d" \n\t"\ | ||
79 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | ||
80 | ASMALIGN(4) /* FIXME Unroll? */\ | ||
81 | "1: \n\t"\ | ||
82 | "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ | ||
83 | "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\ | ||
84 | "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\ | ||
85 | "add $16, %%"REG_d" \n\t"\ | ||
86 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | ||
87 | "test %%"REG_S", %%"REG_S" \n\t"\ | ||
88 | "pmulhw %%mm0, %%mm2 \n\t"\ | ||
89 | "pmulhw %%mm0, %%mm5 \n\t"\ | ||
90 | "paddw %%mm2, %%mm3 \n\t"\ | ||
91 | "paddw %%mm5, %%mm4 \n\t"\ | ||
92 | " jnz 1b \n\t"\ | ||
93 | "psraw $3, %%mm3 \n\t"\ | ||
94 | "psraw $3, %%mm4 \n\t"\ | ||
95 | "packuswb %%mm4, %%mm3 \n\t"\ | ||
96 | MOVNTQ(%%mm3, (%1, %%REGa))\ | ||
97 | "add $8, %%"REG_a" \n\t"\ | ||
98 | "cmp %2, %%"REG_a" \n\t"\ | ||
99 | "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ | ||
100 | "movq %%mm3, %%mm4 \n\t"\ | ||
101 | "lea " offset "(%0), %%"REG_d" \n\t"\ | ||
102 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | ||
103 | "jb 1b \n\t"\ | ||
104 | :: "r" (&c->redDither),\ | ||
105 | "r" (dest), "g" (width)\ | ||
106 | : "%"REG_a, "%"REG_d, "%"REG_S\ | ||
107 | ); | ||
108 | |||
109 | #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \ | ||
110 | asm volatile(\ | ||
111 | "lea " offset "(%0), %%"REG_d" \n\t"\ | ||
112 | "xor %%"REG_a", %%"REG_a" \n\t"\ | ||
113 | "pxor %%mm4, %%mm4 \n\t"\ | ||
114 | "pxor %%mm5, %%mm5 \n\t"\ | ||
115 | "pxor %%mm6, %%mm6 \n\t"\ | ||
116 | "pxor %%mm7, %%mm7 \n\t"\ | ||
117 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | ||
118 | ASMALIGN(4) \ | ||
119 | "1: \n\t"\ | ||
120 | "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\ | ||
121 | "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\ | ||
122 | "mov 4(%%"REG_d"), %%"REG_S" \n\t"\ | ||
123 | "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\ | ||
124 | "movq %%mm0, %%mm3 \n\t"\ | ||
125 | "punpcklwd %%mm1, %%mm0 \n\t"\ | ||
126 | "punpckhwd %%mm1, %%mm3 \n\t"\ | ||
127 | "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\ | ||
128 | "pmaddwd %%mm1, %%mm0 \n\t"\ | ||
129 | "pmaddwd %%mm1, %%mm3 \n\t"\ | ||
130 | "paddd %%mm0, %%mm4 \n\t"\ | ||
131 | "paddd %%mm3, %%mm5 \n\t"\ | ||
132 | "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\ | ||
133 | "mov 16(%%"REG_d"), %%"REG_S" \n\t"\ | ||
134 | "add $16, %%"REG_d" \n\t"\ | ||
135 | "test %%"REG_S", %%"REG_S" \n\t"\ | ||
136 | "movq %%mm2, %%mm0 \n\t"\ | ||
137 | "punpcklwd %%mm3, %%mm2 \n\t"\ | ||
138 | "punpckhwd %%mm3, %%mm0 \n\t"\ | ||
139 | "pmaddwd %%mm1, %%mm2 \n\t"\ | ||
140 | "pmaddwd %%mm1, %%mm0 \n\t"\ | ||
141 | "paddd %%mm2, %%mm6 \n\t"\ | ||
142 | "paddd %%mm0, %%mm7 \n\t"\ | ||
143 | " jnz 1b \n\t"\ | ||
144 | "psrad $16, %%mm4 \n\t"\ | ||
145 | "psrad $16, %%mm5 \n\t"\ | ||
146 | "psrad $16, %%mm6 \n\t"\ | ||
147 | "psrad $16, %%mm7 \n\t"\ | ||
148 | "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ | ||
149 | "packssdw %%mm5, %%mm4 \n\t"\ | ||
150 | "packssdw %%mm7, %%mm6 \n\t"\ | ||
151 | "paddw %%mm0, %%mm4 \n\t"\ | ||
152 | "paddw %%mm0, %%mm6 \n\t"\ | ||
153 | "psraw $3, %%mm4 \n\t"\ | ||
154 | "psraw $3, %%mm6 \n\t"\ | ||
155 | "packuswb %%mm6, %%mm4 \n\t"\ | ||
156 | MOVNTQ(%%mm4, (%1, %%REGa))\ | ||
157 | "add $8, %%"REG_a" \n\t"\ | ||
158 | "cmp %2, %%"REG_a" \n\t"\ | ||
159 | "lea " offset "(%0), %%"REG_d" \n\t"\ | ||
160 | "pxor %%mm4, %%mm4 \n\t"\ | ||
161 | "pxor %%mm5, %%mm5 \n\t"\ | ||
162 | "pxor %%mm6, %%mm6 \n\t"\ | ||
163 | "pxor %%mm7, %%mm7 \n\t"\ | ||
164 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | ||
165 | "jb 1b \n\t"\ | ||
166 | :: "r" (&c->redDither),\ | ||
167 | "r" (dest), "g" (width)\ | ||
168 | : "%"REG_a, "%"REG_d, "%"REG_S\ | ||
169 | ); | ||
170 | |||
171 | #define YSCALEYUV2YV121 \ | ||
172 | "mov %2, %%"REG_a" \n\t"\ | ||
173 | ASMALIGN(4) /* FIXME Unroll? */\ | ||
174 | "1: \n\t"\ | ||
175 | "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\ | ||
176 | "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\ | ||
177 | "psraw $7, %%mm0 \n\t"\ | ||
178 | "psraw $7, %%mm1 \n\t"\ | ||
179 | "packuswb %%mm1, %%mm0 \n\t"\ | ||
180 | MOVNTQ(%%mm0, (%1, %%REGa))\ | ||
181 | "add $8, %%"REG_a" \n\t"\ | ||
182 | "jnc 1b \n\t" | ||
183 | |||
184 | #define YSCALEYUV2YV121_ACCURATE \ | ||
185 | "mov %2, %%"REG_a" \n\t"\ | ||
186 | "pcmpeqw %%mm7, %%mm7 \n\t"\ | ||
187 | "psrlw $15, %%mm7 \n\t"\ | ||
188 | "psllw $6, %%mm7 \n\t"\ | ||
189 | ASMALIGN(4) /* FIXME Unroll? */\ | ||
190 | "1: \n\t"\ | ||
191 | "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\ | ||
192 | "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\ | ||
193 | "paddw %%mm7, %%mm0 \n\t"\ | ||
194 | "paddw %%mm7, %%mm1 \n\t"\ | ||
195 | "psraw $7, %%mm0 \n\t"\ | ||
196 | "psraw $7, %%mm1 \n\t"\ | ||
197 | "packuswb %%mm1, %%mm0 \n\t"\ | ||
198 | MOVNTQ(%%mm0, (%1, %%REGa))\ | ||
199 | "add $8, %%"REG_a" \n\t"\ | ||
200 | "jnc 1b \n\t" | ||
201 | |||
202 | /* | ||
203 | :: "m" (-lumFilterSize), "m" (-chrFilterSize), | ||
204 | "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4), | ||
205 | "r" (dest), "m" (dstW), | ||
206 | "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize) | ||
207 | : "%eax", "%ebx", "%ecx", "%edx", "%esi" | ||
208 | */ | ||
209 | #define YSCALEYUV2PACKEDX \ | ||
210 | asm volatile(\ | ||
211 | "xor %%"REG_a", %%"REG_a" \n\t"\ | ||
212 | ASMALIGN(4)\ | ||
213 | "nop \n\t"\ | ||
214 | "1: \n\t"\ | ||
215 | "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ | ||
216 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | ||
217 | "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ | ||
218 | "movq %%mm3, %%mm4 \n\t"\ | ||
219 | ASMALIGN(4)\ | ||
220 | "2: \n\t"\ | ||
221 | "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ | ||
222 | "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\ | ||
223 | "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\ | ||
224 | "add $16, %%"REG_d" \n\t"\ | ||
225 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | ||
226 | "pmulhw %%mm0, %%mm2 \n\t"\ | ||
227 | "pmulhw %%mm0, %%mm5 \n\t"\ | ||
228 | "paddw %%mm2, %%mm3 \n\t"\ | ||
229 | "paddw %%mm5, %%mm4 \n\t"\ | ||
230 | "test %%"REG_S", %%"REG_S" \n\t"\ | ||
231 | " jnz 2b \n\t"\ | ||
232 | \ | ||
233 | "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ | ||
234 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | ||
235 | "movq "VROUNDER_OFFSET"(%0), %%mm1 \n\t"\ | ||
236 | "movq %%mm1, %%mm7 \n\t"\ | ||
237 | ASMALIGN(4)\ | ||
238 | "2: \n\t"\ | ||
239 | "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ | ||
240 | "movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\ | ||
241 | "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* Y2srcData */\ | ||
242 | "add $16, %%"REG_d" \n\t"\ | ||
243 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | ||
244 | "pmulhw %%mm0, %%mm2 \n\t"\ | ||
245 | "pmulhw %%mm0, %%mm5 \n\t"\ | ||
246 | "paddw %%mm2, %%mm1 \n\t"\ | ||
247 | "paddw %%mm5, %%mm7 \n\t"\ | ||
248 | "test %%"REG_S", %%"REG_S" \n\t"\ | ||
249 | " jnz 2b \n\t"\ | ||
250 | |||
251 | #define YSCALEYUV2PACKEDX_END \ | ||
252 | :: "r" (&c->redDither), \ | ||
253 | "m" (dummy), "m" (dummy), "m" (dummy),\ | ||
254 | "r" (dest), "m" (dstW) \ | ||
255 | : "%"REG_a, "%"REG_d, "%"REG_S \ | ||
256 | ); | ||
257 | |||
258 | #define YSCALEYUV2PACKEDX_ACCURATE \ | ||
259 | asm volatile(\ | ||
260 | "xor %%"REG_a", %%"REG_a" \n\t"\ | ||
261 | ASMALIGN(4)\ | ||
262 | "nop \n\t"\ | ||
263 | "1: \n\t"\ | ||
264 | "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ | ||
265 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | ||
266 | "pxor %%mm4, %%mm4 \n\t"\ | ||
267 | "pxor %%mm5, %%mm5 \n\t"\ | ||
268 | "pxor %%mm6, %%mm6 \n\t"\ | ||
269 | "pxor %%mm7, %%mm7 \n\t"\ | ||
270 | ASMALIGN(4)\ | ||
271 | "2: \n\t"\ | ||
272 | "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\ | ||
273 | "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\ | ||
274 | "mov 4(%%"REG_d"), %%"REG_S" \n\t"\ | ||
275 | "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\ | ||
276 | "movq %%mm0, %%mm3 \n\t"\ | ||
277 | "punpcklwd %%mm1, %%mm0 \n\t"\ | ||
278 | "punpckhwd %%mm1, %%mm3 \n\t"\ | ||
279 | "movq 8(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\ | ||
280 | "pmaddwd %%mm1, %%mm0 \n\t"\ | ||
281 | "pmaddwd %%mm1, %%mm3 \n\t"\ | ||
282 | "paddd %%mm0, %%mm4 \n\t"\ | ||
283 | "paddd %%mm3, %%mm5 \n\t"\ | ||
284 | "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\ | ||
285 | "mov 16(%%"REG_d"), %%"REG_S" \n\t"\ | ||
286 | "add $16, %%"REG_d" \n\t"\ | ||
287 | "test %%"REG_S", %%"REG_S" \n\t"\ | ||
288 | "movq %%mm2, %%mm0 \n\t"\ | ||
289 | "punpcklwd %%mm3, %%mm2 \n\t"\ | ||
290 | "punpckhwd %%mm3, %%mm0 \n\t"\ | ||
291 | "pmaddwd %%mm1, %%mm2 \n\t"\ | ||
292 | "pmaddwd %%mm1, %%mm0 \n\t"\ | ||
293 | "paddd %%mm2, %%mm6 \n\t"\ | ||
294 | "paddd %%mm0, %%mm7 \n\t"\ | ||
295 | " jnz 2b \n\t"\ | ||
296 | "psrad $16, %%mm4 \n\t"\ | ||
297 | "psrad $16, %%mm5 \n\t"\ | ||
298 | "psrad $16, %%mm6 \n\t"\ | ||
299 | "psrad $16, %%mm7 \n\t"\ | ||
300 | "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ | ||
301 | "packssdw %%mm5, %%mm4 \n\t"\ | ||
302 | "packssdw %%mm7, %%mm6 \n\t"\ | ||
303 | "paddw %%mm0, %%mm4 \n\t"\ | ||
304 | "paddw %%mm0, %%mm6 \n\t"\ | ||
305 | "movq %%mm4, "U_TEMP"(%0) \n\t"\ | ||
306 | "movq %%mm6, "V_TEMP"(%0) \n\t"\ | ||
307 | \ | ||
308 | "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ | ||
309 | "mov (%%"REG_d"), %%"REG_S" \n\t"\ | ||
310 | "pxor %%mm1, %%mm1 \n\t"\ | ||
311 | "pxor %%mm5, %%mm5 \n\t"\ | ||
312 | "pxor %%mm7, %%mm7 \n\t"\ | ||
313 | "pxor %%mm6, %%mm6 \n\t"\ | ||
314 | ASMALIGN(4)\ | ||
315 | "2: \n\t"\ | ||
316 | "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\ | ||
317 | "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\ | ||
318 | "mov 4(%%"REG_d"), %%"REG_S" \n\t"\ | ||
319 | "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\ | ||
320 | "movq %%mm0, %%mm3 \n\t"\ | ||
321 | "punpcklwd %%mm4, %%mm0 \n\t"\ | ||
322 | "punpckhwd %%mm4, %%mm3 \n\t"\ | ||
323 | "movq 8(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\ | ||
324 | "pmaddwd %%mm4, %%mm0 \n\t"\ | ||
325 | "pmaddwd %%mm4, %%mm3 \n\t"\ | ||
326 | "paddd %%mm0, %%mm1 \n\t"\ | ||
327 | "paddd %%mm3, %%mm5 \n\t"\ | ||
328 | "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\ | ||
329 | "mov 16(%%"REG_d"), %%"REG_S" \n\t"\ | ||
330 | "add $16, %%"REG_d" \n\t"\ | ||
331 | "test %%"REG_S", %%"REG_S" \n\t"\ | ||
332 | "movq %%mm2, %%mm0 \n\t"\ | ||
333 | "punpcklwd %%mm3, %%mm2 \n\t"\ | ||
334 | "punpckhwd %%mm3, %%mm0 \n\t"\ | ||
335 | "pmaddwd %%mm4, %%mm2 \n\t"\ | ||
336 | "pmaddwd %%mm4, %%mm0 \n\t"\ | ||
337 | "paddd %%mm2, %%mm7 \n\t"\ | ||
338 | "paddd %%mm0, %%mm6 \n\t"\ | ||
339 | " jnz 2b \n\t"\ | ||
340 | "psrad $16, %%mm1 \n\t"\ | ||
341 | "psrad $16, %%mm5 \n\t"\ | ||
342 | "psrad $16, %%mm7 \n\t"\ | ||
343 | "psrad $16, %%mm6 \n\t"\ | ||
344 | "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ | ||
345 | "packssdw %%mm5, %%mm1 \n\t"\ | ||
346 | "packssdw %%mm6, %%mm7 \n\t"\ | ||
347 | "paddw %%mm0, %%mm1 \n\t"\ | ||
348 | "paddw %%mm0, %%mm7 \n\t"\ | ||
349 | "movq "U_TEMP"(%0), %%mm3 \n\t"\ | ||
350 | "movq "V_TEMP"(%0), %%mm4 \n\t"\ | ||
351 | |||
352 | #define YSCALEYUV2RGBX \ | ||
353 | "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\ | ||
354 | "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\ | ||
355 | "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | ||
356 | "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | ||
357 | "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\ | ||
358 | "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\ | ||
359 | /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ | ||
360 | "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\ | ||
361 | "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\ | ||
362 | "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\ | ||
363 | "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\ | ||
364 | "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\ | ||
365 | "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\ | ||
366 | /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ | ||
367 | "paddw %%mm3, %%mm4 \n\t"\ | ||
368 | "movq %%mm2, %%mm0 \n\t"\ | ||
369 | "movq %%mm5, %%mm6 \n\t"\ | ||
370 | "movq %%mm4, %%mm3 \n\t"\ | ||
371 | "punpcklwd %%mm2, %%mm2 \n\t"\ | ||
372 | "punpcklwd %%mm5, %%mm5 \n\t"\ | ||
373 | "punpcklwd %%mm4, %%mm4 \n\t"\ | ||
374 | "paddw %%mm1, %%mm2 \n\t"\ | ||
375 | "paddw %%mm1, %%mm5 \n\t"\ | ||
376 | "paddw %%mm1, %%mm4 \n\t"\ | ||
377 | "punpckhwd %%mm0, %%mm0 \n\t"\ | ||
378 | "punpckhwd %%mm6, %%mm6 \n\t"\ | ||
379 | "punpckhwd %%mm3, %%mm3 \n\t"\ | ||
380 | "paddw %%mm7, %%mm0 \n\t"\ | ||
381 | "paddw %%mm7, %%mm6 \n\t"\ | ||
382 | "paddw %%mm7, %%mm3 \n\t"\ | ||
383 | /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ | ||
384 | "packuswb %%mm0, %%mm2 \n\t"\ | ||
385 | "packuswb %%mm6, %%mm5 \n\t"\ | ||
386 | "packuswb %%mm3, %%mm4 \n\t"\ | ||
387 | "pxor %%mm7, %%mm7 \n\t" | ||
388 | #if 0 | ||
389 | #define FULL_YSCALEYUV2RGB \ | ||
390 | "pxor %%mm7, %%mm7 \n\t"\ | ||
391 | "movd %6, %%mm6 \n\t" /*yalpha1*/\ | ||
392 | "punpcklwd %%mm6, %%mm6 \n\t"\ | ||
393 | "punpcklwd %%mm6, %%mm6 \n\t"\ | ||
394 | "movd %7, %%mm5 \n\t" /*uvalpha1*/\ | ||
395 | "punpcklwd %%mm5, %%mm5 \n\t"\ | ||
396 | "punpcklwd %%mm5, %%mm5 \n\t"\ | ||
397 | "xor %%"REG_a", %%"REG_a" \n\t"\ | ||
398 | ASMALIGN(4)\ | ||
399 | "1: \n\t"\ | ||
400 | "movq (%0, %%"REG_a",2), %%mm0 \n\t" /*buf0[eax]*/\ | ||
401 | "movq (%1, %%"REG_a",2), %%mm1 \n\t" /*buf1[eax]*/\ | ||
402 | "movq (%2, %%"REG_a",2), %%mm2 \n\t" /* uvbuf0[eax]*/\ | ||
403 | "movq (%3, %%"REG_a",2), %%mm3 \n\t" /* uvbuf1[eax]*/\ | ||
404 | "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ | ||
405 | "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ | ||
406 | "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | ||
407 | "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ | ||
408 | "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ | ||
409 | "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ | ||
410 | "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ | ||
411 | "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | ||
412 | "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\ | ||
413 | "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ | ||
414 | "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | ||
415 | "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\ | ||
416 | "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\ | ||
417 | "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\ | ||
418 | \ | ||
419 | \ | ||
420 | "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ | ||
421 | "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | ||
422 | "pmulhw "MANGLE(ubCoeff)", %%mm3 \n\t"\ | ||
423 | "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ | ||
424 | "pmulhw "MANGLE(ugCoeff)", %%mm2 \n\t"\ | ||
425 | "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ | ||
426 | "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\ | ||
427 | \ | ||
428 | \ | ||
429 | "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\ | ||
430 | "pmulhw "MANGLE(vrCoeff)", %%mm0 \n\t"\ | ||
431 | "pmulhw "MANGLE(vgCoeff)", %%mm4 \n\t"\ | ||
432 | "paddw %%mm1, %%mm3 \n\t" /* B*/\ | ||
433 | "paddw %%mm1, %%mm0 \n\t" /* R*/\ | ||
434 | "packuswb %%mm3, %%mm3 \n\t"\ | ||
435 | \ | ||
436 | "packuswb %%mm0, %%mm0 \n\t"\ | ||
437 | "paddw %%mm4, %%mm2 \n\t"\ | ||
438 | "paddw %%mm2, %%mm1 \n\t" /* G*/\ | ||
439 | \ | ||
440 | "packuswb %%mm1, %%mm1 \n\t" | ||
441 | #endif | ||
442 | |||
443 | #define REAL_YSCALEYUV2PACKED(index, c) \ | ||
444 | "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ | ||
445 | "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\ | ||
446 | "psraw $3, %%mm0 \n\t"\ | ||
447 | "psraw $3, %%mm1 \n\t"\ | ||
448 | "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\ | ||
449 | "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\ | ||
450 | "xor "#index", "#index" \n\t"\ | ||
451 | ASMALIGN(4)\ | ||
452 | "1: \n\t"\ | ||
453 | "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ | ||
454 | "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ | ||
455 | "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ | ||
456 | "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ | ||
457 | "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ | ||
458 | "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | ||
459 | "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ | ||
460 | "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ | ||
461 | "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ | ||
462 | "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ | ||
463 | "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ | ||
464 | "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ | ||
465 | "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ | ||
466 | "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ | ||
467 | "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ | ||
468 | "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ | ||
469 | "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ | ||
470 | "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ | ||
471 | "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ | ||
472 | "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | ||
473 | "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | ||
474 | "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ | ||
475 | "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ | ||
476 | "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | ||
477 | "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | ||
478 | |||
479 | #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c) | ||
480 | |||
481 | #define REAL_YSCALEYUV2RGB(index, c) \ | ||
482 | "xor "#index", "#index" \n\t"\ | ||
483 | ASMALIGN(4)\ | ||
484 | "1: \n\t"\ | ||
485 | "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ | ||
486 | "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ | ||
487 | "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ | ||
488 | "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ | ||
489 | "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ | ||
490 | "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | ||
491 | "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ | ||
492 | "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ | ||
493 | "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ | ||
494 | "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ | ||
495 | "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ | ||
496 | "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ | ||
497 | "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ | ||
498 | "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ | ||
499 | "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ | ||
500 | "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | ||
501 | "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | ||
502 | "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ | ||
503 | "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ | ||
504 | /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ | ||
505 | "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ | ||
506 | "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ | ||
507 | "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ | ||
508 | "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ | ||
509 | "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ | ||
510 | "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ | ||
511 | "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | ||
512 | "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | ||
513 | "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ | ||
514 | "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ | ||
515 | "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | ||
516 | "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | ||
517 | "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ | ||
518 | "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ | ||
519 | "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ | ||
520 | "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ | ||
521 | "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ | ||
522 | "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ | ||
523 | /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ | ||
524 | "paddw %%mm3, %%mm4 \n\t"\ | ||
525 | "movq %%mm2, %%mm0 \n\t"\ | ||
526 | "movq %%mm5, %%mm6 \n\t"\ | ||
527 | "movq %%mm4, %%mm3 \n\t"\ | ||
528 | "punpcklwd %%mm2, %%mm2 \n\t"\ | ||
529 | "punpcklwd %%mm5, %%mm5 \n\t"\ | ||
530 | "punpcklwd %%mm4, %%mm4 \n\t"\ | ||
531 | "paddw %%mm1, %%mm2 \n\t"\ | ||
532 | "paddw %%mm1, %%mm5 \n\t"\ | ||
533 | "paddw %%mm1, %%mm4 \n\t"\ | ||
534 | "punpckhwd %%mm0, %%mm0 \n\t"\ | ||
535 | "punpckhwd %%mm6, %%mm6 \n\t"\ | ||
536 | "punpckhwd %%mm3, %%mm3 \n\t"\ | ||
537 | "paddw %%mm7, %%mm0 \n\t"\ | ||
538 | "paddw %%mm7, %%mm6 \n\t"\ | ||
539 | "paddw %%mm7, %%mm3 \n\t"\ | ||
540 | /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ | ||
541 | "packuswb %%mm0, %%mm2 \n\t"\ | ||
542 | "packuswb %%mm6, %%mm5 \n\t"\ | ||
543 | "packuswb %%mm3, %%mm4 \n\t"\ | ||
544 | "pxor %%mm7, %%mm7 \n\t" | ||
545 | #define YSCALEYUV2RGB(index, c) REAL_YSCALEYUV2RGB(index, c) | ||
546 | |||
547 | #define REAL_YSCALEYUV2PACKED1(index, c) \ | ||
548 | "xor "#index", "#index" \n\t"\ | ||
549 | ASMALIGN(4)\ | ||
550 | "1: \n\t"\ | ||
551 | "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ | ||
552 | "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ | ||
553 | "psraw $7, %%mm3 \n\t" \ | ||
554 | "psraw $7, %%mm4 \n\t" \ | ||
555 | "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ | ||
556 | "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ | ||
557 | "psraw $7, %%mm1 \n\t" \ | ||
558 | "psraw $7, %%mm7 \n\t" \ | ||
559 | |||
560 | #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c) | ||
561 | |||
562 | #define REAL_YSCALEYUV2RGB1(index, c) \ | ||
563 | "xor "#index", "#index" \n\t"\ | ||
564 | ASMALIGN(4)\ | ||
565 | "1: \n\t"\ | ||
566 | "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ | ||
567 | "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ | ||
568 | "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ | ||
569 | "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ | ||
570 | "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ | ||
571 | "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ | ||
572 | "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | ||
573 | "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | ||
574 | "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ | ||
575 | "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ | ||
576 | /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ | ||
577 | "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ | ||
578 | "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ | ||
579 | "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ | ||
580 | "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ | ||
581 | "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ | ||
582 | "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ | ||
583 | "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ | ||
584 | "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ | ||
585 | "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ | ||
586 | "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ | ||
587 | /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ | ||
588 | "paddw %%mm3, %%mm4 \n\t"\ | ||
589 | "movq %%mm2, %%mm0 \n\t"\ | ||
590 | "movq %%mm5, %%mm6 \n\t"\ | ||
591 | "movq %%mm4, %%mm3 \n\t"\ | ||
592 | "punpcklwd %%mm2, %%mm2 \n\t"\ | ||
593 | "punpcklwd %%mm5, %%mm5 \n\t"\ | ||
594 | "punpcklwd %%mm4, %%mm4 \n\t"\ | ||
595 | "paddw %%mm1, %%mm2 \n\t"\ | ||
596 | "paddw %%mm1, %%mm5 \n\t"\ | ||
597 | "paddw %%mm1, %%mm4 \n\t"\ | ||
598 | "punpckhwd %%mm0, %%mm0 \n\t"\ | ||
599 | "punpckhwd %%mm6, %%mm6 \n\t"\ | ||
600 | "punpckhwd %%mm3, %%mm3 \n\t"\ | ||
601 | "paddw %%mm7, %%mm0 \n\t"\ | ||
602 | "paddw %%mm7, %%mm6 \n\t"\ | ||
603 | "paddw %%mm7, %%mm3 \n\t"\ | ||
604 | /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ | ||
605 | "packuswb %%mm0, %%mm2 \n\t"\ | ||
606 | "packuswb %%mm6, %%mm5 \n\t"\ | ||
607 | "packuswb %%mm3, %%mm4 \n\t"\ | ||
608 | "pxor %%mm7, %%mm7 \n\t" | ||
609 | #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c) | ||
610 | |||
611 | #define REAL_YSCALEYUV2PACKED1b(index, c) \ | ||
612 | "xor "#index", "#index" \n\t"\ | ||
613 | ASMALIGN(4)\ | ||
614 | "1: \n\t"\ | ||
615 | "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ | ||
616 | "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ | ||
617 | "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ | ||
618 | "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ | ||
619 | "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ | ||
620 | "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ | ||
621 | "psrlw $8, %%mm3 \n\t" \ | ||
622 | "psrlw $8, %%mm4 \n\t" \ | ||
623 | "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ | ||
624 | "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ | ||
625 | "psraw $7, %%mm1 \n\t" \ | ||
626 | "psraw $7, %%mm7 \n\t" | ||
627 | #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c) | ||
628 | |||
629 | // do vertical chrominance interpolation | ||
630 | #define REAL_YSCALEYUV2RGB1b(index, c) \ | ||
631 | "xor "#index", "#index" \n\t"\ | ||
632 | ASMALIGN(4)\ | ||
633 | "1: \n\t"\ | ||
634 | "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ | ||
635 | "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ | ||
636 | "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ | ||
637 | "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ | ||
638 | "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ | ||
639 | "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ | ||
640 | "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\ | ||
641 | "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\ | ||
642 | "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ | ||
643 | "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ | ||
644 | "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | ||
645 | "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ | ||
646 | "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ | ||
647 | "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ | ||
648 | /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ | ||
649 | "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ | ||
650 | "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ | ||
651 | "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ | ||
652 | "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ | ||
653 | "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ | ||
654 | "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ | ||
655 | "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ | ||
656 | "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ | ||
657 | "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ | ||
658 | "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ | ||
659 | /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ | ||
660 | "paddw %%mm3, %%mm4 \n\t"\ | ||
661 | "movq %%mm2, %%mm0 \n\t"\ | ||
662 | "movq %%mm5, %%mm6 \n\t"\ | ||
663 | "movq %%mm4, %%mm3 \n\t"\ | ||
664 | "punpcklwd %%mm2, %%mm2 \n\t"\ | ||
665 | "punpcklwd %%mm5, %%mm5 \n\t"\ | ||
666 | "punpcklwd %%mm4, %%mm4 \n\t"\ | ||
667 | "paddw %%mm1, %%mm2 \n\t"\ | ||
668 | "paddw %%mm1, %%mm5 \n\t"\ | ||
669 | "paddw %%mm1, %%mm4 \n\t"\ | ||
670 | "punpckhwd %%mm0, %%mm0 \n\t"\ | ||
671 | "punpckhwd %%mm6, %%mm6 \n\t"\ | ||
672 | "punpckhwd %%mm3, %%mm3 \n\t"\ | ||
673 | "paddw %%mm7, %%mm0 \n\t"\ | ||
674 | "paddw %%mm7, %%mm6 \n\t"\ | ||
675 | "paddw %%mm7, %%mm3 \n\t"\ | ||
676 | /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ | ||
677 | "packuswb %%mm0, %%mm2 \n\t"\ | ||
678 | "packuswb %%mm6, %%mm5 \n\t"\ | ||
679 | "packuswb %%mm3, %%mm4 \n\t"\ | ||
680 | "pxor %%mm7, %%mm7 \n\t" | ||
681 | #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c) | ||
682 | |||
683 | #define REAL_WRITEBGR32(dst, dstw, index) \ | ||
684 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ | ||
685 | "movq %%mm2, %%mm1 \n\t" /* B */\ | ||
686 | "movq %%mm5, %%mm6 \n\t" /* R */\ | ||
687 | "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ | ||
688 | "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ | ||
689 | "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ | ||
690 | "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ | ||
691 | "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ | ||
692 | "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ | ||
693 | "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ | ||
694 | "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | ||
695 | "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ | ||
696 | "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ | ||
697 | \ | ||
698 | MOVNTQ(%%mm0, (dst, index, 4))\ | ||
699 | MOVNTQ(%%mm2, 8(dst, index, 4))\ | ||
700 | MOVNTQ(%%mm1, 16(dst, index, 4))\ | ||
701 | MOVNTQ(%%mm3, 24(dst, index, 4))\ | ||
702 | \ | ||
703 | "add $8, "#index" \n\t"\ | ||
704 | "cmp "#dstw", "#index" \n\t"\ | ||
705 | " jb 1b \n\t" | ||
706 | #define WRITEBGR32(dst, dstw, index) REAL_WRITEBGR32(dst, dstw, index) | ||
707 | |||
708 | #define REAL_WRITEBGR16(dst, dstw, index) \ | ||
709 | "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ | ||
710 | "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\ | ||
711 | "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ | ||
712 | "psrlq $3, %%mm2 \n\t"\ | ||
713 | \ | ||
714 | "movq %%mm2, %%mm1 \n\t"\ | ||
715 | "movq %%mm4, %%mm3 \n\t"\ | ||
716 | \ | ||
717 | "punpcklbw %%mm7, %%mm3 \n\t"\ | ||
718 | "punpcklbw %%mm5, %%mm2 \n\t"\ | ||
719 | "punpckhbw %%mm7, %%mm4 \n\t"\ | ||
720 | "punpckhbw %%mm5, %%mm1 \n\t"\ | ||
721 | \ | ||
722 | "psllq $3, %%mm3 \n\t"\ | ||
723 | "psllq $3, %%mm4 \n\t"\ | ||
724 | \ | ||
725 | "por %%mm3, %%mm2 \n\t"\ | ||
726 | "por %%mm4, %%mm1 \n\t"\ | ||
727 | \ | ||
728 | MOVNTQ(%%mm2, (dst, index, 2))\ | ||
729 | MOVNTQ(%%mm1, 8(dst, index, 2))\ | ||
730 | \ | ||
731 | "add $8, "#index" \n\t"\ | ||
732 | "cmp "#dstw", "#index" \n\t"\ | ||
733 | " jb 1b \n\t" | ||
734 | #define WRITEBGR16(dst, dstw, index) REAL_WRITEBGR16(dst, dstw, index) | ||
735 | |||
736 | #define REAL_WRITEBGR15(dst, dstw, index) \ | ||
737 | "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ | ||
738 | "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\ | ||
739 | "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ | ||
740 | "psrlq $3, %%mm2 \n\t"\ | ||
741 | "psrlq $1, %%mm5 \n\t"\ | ||
742 | \ | ||
743 | "movq %%mm2, %%mm1 \n\t"\ | ||
744 | "movq %%mm4, %%mm3 \n\t"\ | ||
745 | \ | ||
746 | "punpcklbw %%mm7, %%mm3 \n\t"\ | ||
747 | "punpcklbw %%mm5, %%mm2 \n\t"\ | ||
748 | "punpckhbw %%mm7, %%mm4 \n\t"\ | ||
749 | "punpckhbw %%mm5, %%mm1 \n\t"\ | ||
750 | \ | ||
751 | "psllq $2, %%mm3 \n\t"\ | ||
752 | "psllq $2, %%mm4 \n\t"\ | ||
753 | \ | ||
754 | "por %%mm3, %%mm2 \n\t"\ | ||
755 | "por %%mm4, %%mm1 \n\t"\ | ||
756 | \ | ||
757 | MOVNTQ(%%mm2, (dst, index, 2))\ | ||
758 | MOVNTQ(%%mm1, 8(dst, index, 2))\ | ||
759 | \ | ||
760 | "add $8, "#index" \n\t"\ | ||
761 | "cmp "#dstw", "#index" \n\t"\ | ||
762 | " jb 1b \n\t" | ||
763 | #define WRITEBGR15(dst, dstw, index) REAL_WRITEBGR15(dst, dstw, index) | ||
764 | |||
765 | #define WRITEBGR24OLD(dst, dstw, index) \ | ||
766 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ | ||
767 | "movq %%mm2, %%mm1 \n\t" /* B */\ | ||
768 | "movq %%mm5, %%mm6 \n\t" /* R */\ | ||
769 | "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ | ||
770 | "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ | ||
771 | "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ | ||
772 | "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ | ||
773 | "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ | ||
774 | "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ | ||
775 | "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ | ||
776 | "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | ||
777 | "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ | ||
778 | "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ | ||
779 | \ | ||
780 | "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ | ||
781 | "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\ | ||
782 | "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\ | ||
783 | "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\ | ||
784 | "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\ | ||
785 | "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\ | ||
786 | "psllq $48, %%mm2 \n\t" /* GB000000 1 */\ | ||
787 | "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ | ||
788 | \ | ||
789 | "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | ||
790 | "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\ | ||
791 | "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\ | ||
792 | "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\ | ||
793 | "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\ | ||
794 | "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\ | ||
795 | "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\ | ||
796 | "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\ | ||
797 | "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\ | ||
798 | "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\ | ||
799 | "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\ | ||
800 | "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\ | ||
801 | "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\ | ||
802 | \ | ||
803 | "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\ | ||
804 | "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\ | ||
805 | "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\ | ||
806 | "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\ | ||
807 | "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\ | ||
808 | "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\ | ||
809 | "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\ | ||
810 | "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\ | ||
811 | \ | ||
812 | MOVNTQ(%%mm0, (dst))\ | ||
813 | MOVNTQ(%%mm2, 8(dst))\ | ||
814 | MOVNTQ(%%mm3, 16(dst))\ | ||
815 | "add $24, "#dst" \n\t"\ | ||
816 | \ | ||
817 | "add $8, "#index" \n\t"\ | ||
818 | "cmp "#dstw", "#index" \n\t"\ | ||
819 | " jb 1b \n\t" | ||
820 | |||
821 | #define WRITEBGR24MMX(dst, dstw, index) \ | ||
822 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ | ||
823 | "movq %%mm2, %%mm1 \n\t" /* B */\ | ||
824 | "movq %%mm5, %%mm6 \n\t" /* R */\ | ||
825 | "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ | ||
826 | "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ | ||
827 | "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ | ||
828 | "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ | ||
829 | "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ | ||
830 | "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ | ||
831 | "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ | ||
832 | "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ | ||
833 | "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ | ||
834 | "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ | ||
835 | \ | ||
836 | "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ | ||
837 | "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\ | ||
838 | "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\ | ||
839 | "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\ | ||
840 | \ | ||
841 | "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\ | ||
842 | "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\ | ||
843 | "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\ | ||
844 | "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\ | ||
845 | \ | ||
846 | "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\ | ||
847 | "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\ | ||
848 | "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\ | ||
849 | "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\ | ||
850 | \ | ||
851 | "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\ | ||
852 | "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\ | ||
853 | "psllq $40, %%mm2 \n\t" /* GB000000 1 */\ | ||
854 | "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ | ||
855 | MOVNTQ(%%mm0, (dst))\ | ||
856 | \ | ||
857 | "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\ | ||
858 | "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\ | ||
859 | "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\ | ||
860 | "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\ | ||
861 | MOVNTQ(%%mm6, 8(dst))\ | ||
862 | \ | ||
863 | "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\ | ||
864 | "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\ | ||
865 | "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\ | ||
866 | MOVNTQ(%%mm5, 16(dst))\ | ||
867 | \ | ||
868 | "add $24, "#dst" \n\t"\ | ||
869 | \ | ||
870 | "add $8, "#index" \n\t"\ | ||
871 | "cmp "#dstw", "#index" \n\t"\ | ||
872 | " jb 1b \n\t" | ||
873 | |||
874 | #define WRITEBGR24MMX2(dst, dstw, index) \ | ||
875 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ | ||
876 | "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\ | ||
877 | "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\ | ||
878 | "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\ | ||
879 | "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\ | ||
880 | "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\ | ||
881 | \ | ||
882 | "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\ | ||
883 | "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\ | ||
884 | "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\ | ||
885 | \ | ||
886 | "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\ | ||
887 | "por %%mm1, %%mm6 \n\t"\ | ||
888 | "por %%mm3, %%mm6 \n\t"\ | ||
889 | MOVNTQ(%%mm6, (dst))\ | ||
890 | \ | ||
891 | "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\ | ||
892 | "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\ | ||
893 | "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\ | ||
894 | "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\ | ||
895 | \ | ||
896 | "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\ | ||
897 | "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\ | ||
898 | "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\ | ||
899 | \ | ||
900 | "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\ | ||
901 | "por %%mm3, %%mm6 \n\t"\ | ||
902 | MOVNTQ(%%mm6, 8(dst))\ | ||
903 | \ | ||
904 | "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\ | ||
905 | "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\ | ||
906 | "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\ | ||
907 | \ | ||
908 | "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\ | ||
909 | "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\ | ||
910 | "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\ | ||
911 | \ | ||
912 | "por %%mm1, %%mm3 \n\t"\ | ||
913 | "por %%mm3, %%mm6 \n\t"\ | ||
914 | MOVNTQ(%%mm6, 16(dst))\ | ||
915 | \ | ||
916 | "add $24, "#dst" \n\t"\ | ||
917 | \ | ||
918 | "add $8, "#index" \n\t"\ | ||
919 | "cmp "#dstw", "#index" \n\t"\ | ||
920 | " jb 1b \n\t" | ||
921 | |||
922 | #ifdef HAVE_MMX2 | ||
923 | #undef WRITEBGR24 | ||
924 | #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index) | ||
925 | #else | ||
926 | #undef WRITEBGR24 | ||
927 | #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index) | ||
928 | #endif | ||
929 | |||
930 | #define REAL_WRITEYUY2(dst, dstw, index) \ | ||
931 | "packuswb %%mm3, %%mm3 \n\t"\ | ||
932 | "packuswb %%mm4, %%mm4 \n\t"\ | ||
933 | "packuswb %%mm7, %%mm1 \n\t"\ | ||
934 | "punpcklbw %%mm4, %%mm3 \n\t"\ | ||
935 | "movq %%mm1, %%mm7 \n\t"\ | ||
936 | "punpcklbw %%mm3, %%mm1 \n\t"\ | ||
937 | "punpckhbw %%mm3, %%mm7 \n\t"\ | ||
938 | \ | ||
939 | MOVNTQ(%%mm1, (dst, index, 2))\ | ||
940 | MOVNTQ(%%mm7, 8(dst, index, 2))\ | ||
941 | \ | ||
942 | "add $8, "#index" \n\t"\ | ||
943 | "cmp "#dstw", "#index" \n\t"\ | ||
944 | " jb 1b \n\t" | ||
945 | #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index) | ||
946 | |||
947 | |||
948 | static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, | ||
949 | int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | ||
950 | uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW) | ||
951 | { | ||
952 | #ifdef HAVE_MMX | ||
953 | if (c->flags & SWS_ACCURATE_RND){ | ||
954 | if (uDest){ | ||
955 | YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW) | ||
956 | YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW) | ||
957 | } | ||
958 | |||
959 | YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW) | ||
960 | }else{ | ||
961 | if (uDest){ | ||
962 | YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW) | ||
963 | YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW) | ||
964 | } | ||
965 | |||
966 | YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW) | ||
967 | } | ||
968 | #else | ||
969 | #ifdef HAVE_ALTIVEC | ||
970 | yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize, | ||
971 | chrFilter, chrSrc, chrFilterSize, | ||
972 | dest, uDest, vDest, dstW, chrDstW); | ||
973 | #else //HAVE_ALTIVEC | ||
974 | yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize, | ||
975 | chrFilter, chrSrc, chrFilterSize, | ||
976 | dest, uDest, vDest, dstW, chrDstW); | ||
977 | #endif //!HAVE_ALTIVEC | ||
978 | #endif /* HAVE_MMX */ | ||
979 | } | ||
980 | |||
981 | static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, | ||
982 | int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | ||
983 | uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat) | ||
984 | { | ||
985 | yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize, | ||
986 | chrFilter, chrSrc, chrFilterSize, | ||
987 | dest, uDest, dstW, chrDstW, dstFormat); | ||
988 | } | ||
989 | |||
990 | static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc, | ||
991 | uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW) | ||
992 | { | ||
993 | #ifdef HAVE_MMX | ||
994 | long p= uDest ? 3 : 1; | ||
995 | uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW}; | ||
996 | uint8_t *dst[3]= {dest, uDest, vDest}; | ||
997 | long counter[3] = {dstW, chrDstW, chrDstW}; | ||
998 | |||
999 | if (c->flags & SWS_ACCURATE_RND){ | ||
1000 | while(p--){ | ||
1001 | asm volatile( | ||
1002 | YSCALEYUV2YV121_ACCURATE | ||
1003 | :: "r" (src[p]), "r" (dst[p] + counter[p]), | ||
1004 | "g" (-counter[p]) | ||
1005 | : "%"REG_a | ||
1006 | ); | ||
1007 | } | ||
1008 | }else{ | ||
1009 | while(p--){ | ||
1010 | asm volatile( | ||
1011 | YSCALEYUV2YV121 | ||
1012 | :: "r" (src[p]), "r" (dst[p] + counter[p]), | ||
1013 | "g" (-counter[p]) | ||
1014 | : "%"REG_a | ||
1015 | ); | ||
1016 | } | ||
1017 | } | ||
1018 | |||
1019 | #else | ||
1020 | int i; | ||
1021 | for (i=0; i<dstW; i++) | ||
1022 | { | ||
1023 | int val= (lumSrc[i]+64)>>7; | ||
1024 | |||
1025 | if (val&256){ | ||
1026 | if (val<0) val=0; | ||
1027 | else val=255; | ||
1028 | } | ||
1029 | |||
1030 | dest[i]= val; | ||
1031 | } | ||
1032 | |||
1033 | if (uDest) | ||
1034 | for (i=0; i<chrDstW; i++) | ||
1035 | { | ||
1036 | int u=(chrSrc[i ]+64)>>7; | ||
1037 | int v=(chrSrc[i + VOFW]+64)>>7; | ||
1038 | |||
1039 | if ((u|v)&256){ | ||
1040 | if (u<0) u=0; | ||
1041 | else if (u>255) u=255; | ||
1042 | if (v<0) v=0; | ||
1043 | else if (v>255) v=255; | ||
1044 | } | ||
1045 | |||
1046 | uDest[i]= u; | ||
1047 | vDest[i]= v; | ||
1048 | } | ||
1049 | #endif | ||
1050 | } | ||
1051 | |||
1052 | |||
1053 | /** | ||
1054 | * vertical scale YV12 to RGB | ||
1055 | */ | ||
1056 | static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, | ||
1057 | int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | ||
1058 | uint8_t *dest, long dstW, long dstY) | ||
1059 | { | ||
1060 | #ifdef HAVE_MMX | ||
1061 | long dummy=0; | ||
1062 | if (c->flags & SWS_ACCURATE_RND){ | ||
1063 | switch(c->dstFormat){ | ||
1064 | case PIX_FMT_RGB32: | ||
1065 | YSCALEYUV2PACKEDX_ACCURATE | ||
1066 | YSCALEYUV2RGBX | ||
1067 | WRITEBGR32(%4, %5, %%REGa) | ||
1068 | |||
1069 | YSCALEYUV2PACKEDX_END | ||
1070 | return; | ||
1071 | case PIX_FMT_BGR24: | ||
1072 | YSCALEYUV2PACKEDX_ACCURATE | ||
1073 | YSCALEYUV2RGBX | ||
1074 | "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize | ||
1075 | "add %4, %%"REG_c" \n\t" | ||
1076 | WRITEBGR24(%%REGc, %5, %%REGa) | ||
1077 | |||
1078 | |||
1079 | :: "r" (&c->redDither), | ||
1080 | "m" (dummy), "m" (dummy), "m" (dummy), | ||
1081 | "r" (dest), "m" (dstW) | ||
1082 | : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S | ||
1083 | ); | ||
1084 | return; | ||
1085 | case PIX_FMT_BGR555: | ||
1086 | YSCALEYUV2PACKEDX_ACCURATE | ||
1087 | YSCALEYUV2RGBX | ||
1088 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | ||
1089 | #ifdef DITHER1XBPP | ||
1090 | "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | ||
1091 | "paddusb "MANGLE(g5Dither)", %%mm4\n\t" | ||
1092 | "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | ||
1093 | #endif | ||
1094 | |||
1095 | WRITEBGR15(%4, %5, %%REGa) | ||
1096 | YSCALEYUV2PACKEDX_END | ||
1097 | return; | ||
1098 | case PIX_FMT_BGR565: | ||
1099 | YSCALEYUV2PACKEDX_ACCURATE | ||
1100 | YSCALEYUV2RGBX | ||
1101 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | ||
1102 | #ifdef DITHER1XBPP | ||
1103 | "paddusb "MANGLE(b5Dither)", %%mm2\n\t" | ||
1104 | "paddusb "MANGLE(g6Dither)", %%mm4\n\t" | ||
1105 | "paddusb "MANGLE(r5Dither)", %%mm5\n\t" | ||
1106 | #endif | ||
1107 | |||
1108 | WRITEBGR16(%4, %5, %%REGa) | ||
1109 | YSCALEYUV2PACKEDX_END | ||
1110 | return; | ||
1111 | case PIX_FMT_YUYV422: | ||
1112 | YSCALEYUV2PACKEDX_ACCURATE | ||
1113 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | ||
1114 | |||
1115 | "psraw $3, %%mm3 \n\t" | ||
1116 | "psraw $3, %%mm4 \n\t" | ||
1117 | "psraw $3, %%mm1 \n\t" | ||
1118 | "psraw $3, %%mm7 \n\t" | ||
1119 | WRITEYUY2(%4, %5, %%REGa) | ||
1120 | YSCALEYUV2PACKEDX_END | ||
1121 | return; | ||
1122 | } | ||
1123 | }else{ | ||
1124 | switch(c->dstFormat) | ||
1125 | { | ||
1126 | case PIX_FMT_RGB32: | ||
1127 | YSCALEYUV2PACKEDX | ||
1128 | YSCALEYUV2RGBX | ||
1129 | WRITEBGR32(%4, %5, %%REGa) | ||
1130 | YSCALEYUV2PACKEDX_END | ||
1131 | return; | ||
1132 | case PIX_FMT_BGR24: | ||
1133 | YSCALEYUV2PACKEDX | ||
1134 | YSCALEYUV2RGBX | ||
1135 | "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize | ||
1136 | "add %4, %%"REG_c" \n\t" | ||
1137 | WRITEBGR24(%%REGc, %5, %%REGa) | ||
1138 | |||
1139 | :: "r" (&c->redDither), | ||
1140 | "m" (dummy), "m" (dummy), "m" (dummy), | ||
1141 | "r" (dest), "m" (dstW) | ||
1142 | : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S | ||
1143 | ); | ||
1144 | return; | ||
1145 | case PIX_FMT_BGR555: | ||
1146 | YSCALEYUV2PACKEDX | ||
1147 | YSCALEYUV2RGBX | ||
1148 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | ||
1149 | #ifdef DITHER1XBPP | ||
1150 | "paddusb "MANGLE(b5Dither)", %%mm2 \n\t" | ||
1151 | "paddusb "MANGLE(g5Dither)", %%mm4 \n\t" | ||
1152 | "paddusb "MANGLE(r5Dither)", %%mm5 \n\t" | ||
1153 | #endif | ||
1154 | |||
1155 | WRITEBGR15(%4, %5, %%REGa) | ||
1156 | YSCALEYUV2PACKEDX_END | ||
1157 | return; | ||
1158 | case PIX_FMT_BGR565: | ||
1159 | YSCALEYUV2PACKEDX | ||
1160 | YSCALEYUV2RGBX | ||
1161 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | ||
1162 | #ifdef DITHER1XBPP | ||
1163 | "paddusb "MANGLE(b5Dither)", %%mm2 \n\t" | ||
1164 | "paddusb "MANGLE(g6Dither)", %%mm4 \n\t" | ||
1165 | "paddusb "MANGLE(r5Dither)", %%mm5 \n\t" | ||
1166 | #endif | ||
1167 | |||
1168 | WRITEBGR16(%4, %5, %%REGa) | ||
1169 | YSCALEYUV2PACKEDX_END | ||
1170 | return; | ||
1171 | case PIX_FMT_YUYV422: | ||
1172 | YSCALEYUV2PACKEDX | ||
1173 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | ||
1174 | |||
1175 | "psraw $3, %%mm3 \n\t" | ||
1176 | "psraw $3, %%mm4 \n\t" | ||
1177 | "psraw $3, %%mm1 \n\t" | ||
1178 | "psraw $3, %%mm7 \n\t" | ||
1179 | WRITEYUY2(%4, %5, %%REGa) | ||
1180 | YSCALEYUV2PACKEDX_END | ||
1181 | return; | ||
1182 | } | ||
1183 | } | ||
1184 | #endif /* HAVE_MMX */ | ||
1185 | #ifdef HAVE_ALTIVEC | ||
1186 | /* The following list of supported dstFormat values should | ||
1187 | match what's found in the body of altivec_yuv2packedX() */ | ||
1188 | if (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA || | ||
1189 | c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 || | ||
1190 | c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB) | ||
1191 | altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize, | ||
1192 | chrFilter, chrSrc, chrFilterSize, | ||
1193 | dest, dstW, dstY); | ||
1194 | else | ||
1195 | #endif | ||
1196 | yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize, | ||
1197 | chrFilter, chrSrc, chrFilterSize, | ||
1198 | dest, dstW, dstY); | ||
1199 | } | ||
1200 | |||
1201 | /** | ||
1202 | * vertical bilinear scale YV12 to RGB | ||
1203 | */ | ||
1204 | static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1, | ||
1205 | uint8_t *dest, int dstW, int yalpha, int uvalpha, int y) | ||
1206 | { | ||
1207 | int yalpha1=yalpha^4095; | ||
1208 | int uvalpha1=uvalpha^4095; | ||
1209 | int i; | ||
1210 | |||
1211 | #if 0 //isn't used | ||
1212 | if (flags&SWS_FULL_CHR_H_INT) | ||
1213 | { | ||
1214 | switch(dstFormat) | ||
1215 | { | ||
1216 | #ifdef HAVE_MMX | ||
1217 | case PIX_FMT_RGB32: | ||
1218 | asm volatile( | ||
1219 | |||
1220 | |||
1221 | FULL_YSCALEYUV2RGB | ||
1222 | "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG | ||
1223 | "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 | ||
1224 | |||
1225 | "movq %%mm3, %%mm1 \n\t" | ||
1226 | "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 | ||
1227 | "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 | ||
1228 | |||
1229 | MOVNTQ(%%mm3, (%4, %%REGa, 4)) | ||
1230 | MOVNTQ(%%mm1, 8(%4, %%REGa, 4)) | ||
1231 | |||
1232 | "add $4, %%"REG_a" \n\t" | ||
1233 | "cmp %5, %%"REG_a" \n\t" | ||
1234 | " jb 1b \n\t" | ||
1235 | |||
1236 | :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW), | ||
1237 | "m" (yalpha1), "m" (uvalpha1) | ||
1238 | : "%"REG_a | ||
1239 | ); | ||
1240 | break; | ||
1241 | case PIX_FMT_BGR24: | ||
1242 | asm volatile( | ||
1243 | |||
1244 | FULL_YSCALEYUV2RGB | ||
1245 | |||
1246 | // lsb ... msb | ||
1247 | "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG | ||
1248 | "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 | ||
1249 | |||
1250 | "movq %%mm3, %%mm1 \n\t" | ||
1251 | "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 | ||
1252 | "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 | ||
1253 | |||
1254 | "movq %%mm3, %%mm2 \n\t" // BGR0BGR0 | ||
1255 | "psrlq $8, %%mm3 \n\t" // GR0BGR00 | ||
1256 | "pand "MANGLE(bm00000111)", %%mm2 \n\t" // BGR00000 | ||
1257 | "pand "MANGLE(bm11111000)", %%mm3 \n\t" // 000BGR00 | ||
1258 | "por %%mm2, %%mm3 \n\t" // BGRBGR00 | ||
1259 | "movq %%mm1, %%mm2 \n\t" | ||
1260 | "psllq $48, %%mm1 \n\t" // 000000BG | ||
1261 | "por %%mm1, %%mm3 \n\t" // BGRBGRBG | ||
1262 | |||
1263 | "movq %%mm2, %%mm1 \n\t" // BGR0BGR0 | ||
1264 | "psrld $16, %%mm2 \n\t" // R000R000 | ||
1265 | "psrlq $24, %%mm1 \n\t" // 0BGR0000 | ||
1266 | "por %%mm2, %%mm1 \n\t" // RBGRR000 | ||
1267 | |||
1268 | "mov %4, %%"REG_b" \n\t" | ||
1269 | "add %%"REG_a", %%"REG_b" \n\t" | ||
1270 | |||
1271 | #ifdef HAVE_MMX2 | ||
1272 | //FIXME Alignment | ||
1273 | "movntq %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t" | ||
1274 | "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t" | ||
1275 | #else | ||
1276 | "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t" | ||
1277 | "psrlq $32, %%mm3 \n\t" | ||
1278 | "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t" | ||
1279 | "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t" | ||
1280 | #endif | ||
1281 | "add $4, %%"REG_a" \n\t" | ||
1282 | "cmp %5, %%"REG_a" \n\t" | ||
1283 | " jb 1b \n\t" | ||
1284 | |||
1285 | :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW), | ||
1286 | "m" (yalpha1), "m" (uvalpha1) | ||
1287 | : "%"REG_a, "%"REG_b | ||
1288 | ); | ||
1289 | break; | ||
1290 | case PIX_FMT_BGR555: | ||
1291 | asm volatile( | ||
1292 | |||
1293 | FULL_YSCALEYUV2RGB | ||
1294 | #ifdef DITHER1XBPP | ||
1295 | "paddusb "MANGLE(g5Dither)", %%mm1 \n\t" | ||
1296 | "paddusb "MANGLE(r5Dither)", %%mm0 \n\t" | ||
1297 | "paddusb "MANGLE(b5Dither)", %%mm3 \n\t" | ||
1298 | #endif | ||
1299 | "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G | ||
1300 | "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B | ||
1301 | "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R | ||
1302 | |||
1303 | "psrlw $3, %%mm3 \n\t" | ||
1304 | "psllw $2, %%mm1 \n\t" | ||
1305 | "psllw $7, %%mm0 \n\t" | ||
1306 | "pand "MANGLE(g15Mask)", %%mm1 \n\t" | ||
1307 | "pand "MANGLE(r15Mask)", %%mm0 \n\t" | ||
1308 | |||
1309 | "por %%mm3, %%mm1 \n\t" | ||
1310 | "por %%mm1, %%mm0 \n\t" | ||
1311 | |||
1312 | MOVNTQ(%%mm0, (%4, %%REGa, 2)) | ||
1313 | |||
1314 | "add $4, %%"REG_a" \n\t" | ||
1315 | "cmp %5, %%"REG_a" \n\t" | ||
1316 | " jb 1b \n\t" | ||
1317 | |||
1318 | :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), | ||
1319 | "m" (yalpha1), "m" (uvalpha1) | ||
1320 | : "%"REG_a | ||
1321 | ); | ||
1322 | break; | ||
1323 | case PIX_FMT_BGR565: | ||
1324 | asm volatile( | ||
1325 | |||
1326 | FULL_YSCALEYUV2RGB | ||
1327 | #ifdef DITHER1XBPP | ||
1328 | "paddusb "MANGLE(g6Dither)", %%mm1 \n\t" | ||
1329 | "paddusb "MANGLE(r5Dither)", %%mm0 \n\t" | ||
1330 | "paddusb "MANGLE(b5Dither)", %%mm3 \n\t" | ||
1331 | #endif | ||
1332 | "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G | ||
1333 | "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B | ||
1334 | "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R | ||
1335 | |||
1336 | "psrlw $3, %%mm3 \n\t" | ||
1337 | "psllw $3, %%mm1 \n\t" | ||
1338 | "psllw $8, %%mm0 \n\t" | ||
1339 | "pand "MANGLE(g16Mask)", %%mm1 \n\t" | ||
1340 | "pand "MANGLE(r16Mask)", %%mm0 \n\t" | ||
1341 | |||
1342 | "por %%mm3, %%mm1 \n\t" | ||
1343 | "por %%mm1, %%mm0 \n\t" | ||
1344 | |||
1345 | MOVNTQ(%%mm0, (%4, %%REGa, 2)) | ||
1346 | |||
1347 | "add $4, %%"REG_a" \n\t" | ||
1348 | "cmp %5, %%"REG_a" \n\t" | ||
1349 | " jb 1b \n\t" | ||
1350 | |||
1351 | :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW), | ||
1352 | "m" (yalpha1), "m" (uvalpha1) | ||
1353 | : "%"REG_a | ||
1354 | ); | ||
1355 | break; | ||
1356 | #endif /* HAVE_MMX */ | ||
1357 | case PIX_FMT_BGR32: | ||
1358 | #ifndef HAVE_MMX | ||
1359 | case PIX_FMT_RGB32: | ||
1360 | #endif | ||
1361 | if (dstFormat==PIX_FMT_RGB32) | ||
1362 | { | ||
1363 | int i; | ||
1364 | #ifdef WORDS_BIGENDIAN | ||
1365 | dest++; | ||
1366 | #endif | ||
1367 | for (i=0;i<dstW;i++){ | ||
1368 | // vertical linear interpolation && yuv2rgb in a single step: | ||
1369 | int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | ||
1370 | int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | ||
1371 | int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19); | ||
1372 | dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)]; | ||
1373 | dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)]; | ||
1374 | dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)]; | ||
1375 | dest+= 4; | ||
1376 | } | ||
1377 | } | ||
1378 | else if (dstFormat==PIX_FMT_BGR24) | ||
1379 | { | ||
1380 | int i; | ||
1381 | for (i=0;i<dstW;i++){ | ||
1382 | // vertical linear interpolation && yuv2rgb in a single step: | ||
1383 | int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | ||
1384 | int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | ||
1385 | int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19); | ||
1386 | dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)]; | ||
1387 | dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)]; | ||
1388 | dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)]; | ||
1389 | dest+= 3; | ||
1390 | } | ||
1391 | } | ||
1392 | else if (dstFormat==PIX_FMT_BGR565) | ||
1393 | { | ||
1394 | int i; | ||
1395 | for (i=0;i<dstW;i++){ | ||
1396 | // vertical linear interpolation && yuv2rgb in a single step: | ||
1397 | int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | ||
1398 | int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | ||
1399 | int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19); | ||
1400 | |||
1401 | ((uint16_t*)dest)[i] = | ||
1402 | clip_table16b[(Y + yuvtab_40cf[U]) >>13] | | ||
1403 | clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] | | ||
1404 | clip_table16r[(Y + yuvtab_3343[V]) >>13]; | ||
1405 | } | ||
1406 | } | ||
1407 | else if (dstFormat==PIX_FMT_BGR555) | ||
1408 | { | ||
1409 | int i; | ||
1410 | for (i=0;i<dstW;i++){ | ||
1411 | // vertical linear interpolation && yuv2rgb in a single step: | ||
1412 | int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; | ||
1413 | int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); | ||
1414 | int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19); | ||
1415 | |||
1416 | ((uint16_t*)dest)[i] = | ||
1417 | clip_table15b[(Y + yuvtab_40cf[U]) >>13] | | ||
1418 | clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] | | ||
1419 | clip_table15r[(Y + yuvtab_3343[V]) >>13]; | ||
1420 | } | ||
1421 | } | ||
1422 | }//FULL_UV_IPOL | ||
1423 | else | ||
1424 | { | ||
1425 | #endif // if 0 | ||
1426 | #ifdef HAVE_MMX | ||
1427 | switch(c->dstFormat) | ||
1428 | { | ||
1429 | //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( | ||
1430 | case PIX_FMT_RGB32: | ||
1431 | asm volatile( | ||
1432 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | ||
1433 | "mov %4, %%"REG_b" \n\t" | ||
1434 | "push %%"REG_BP" \n\t" | ||
1435 | YSCALEYUV2RGB(%%REGBP, %5) | ||
1436 | WRITEBGR32(%%REGb, 8280(%5), %%REGBP) | ||
1437 | "pop %%"REG_BP" \n\t" | ||
1438 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | ||
1439 | |||
1440 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | ||
1441 | "a" (&c->redDither) | ||
1442 | ); | ||
1443 | return; | ||
1444 | case PIX_FMT_BGR24: | ||
1445 | asm volatile( | ||
1446 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | ||
1447 | "mov %4, %%"REG_b" \n\t" | ||
1448 | "push %%"REG_BP" \n\t" | ||
1449 | YSCALEYUV2RGB(%%REGBP, %5) | ||
1450 | WRITEBGR24(%%REGb, 8280(%5), %%REGBP) | ||
1451 | "pop %%"REG_BP" \n\t" | ||
1452 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | ||
1453 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | ||
1454 | "a" (&c->redDither) | ||
1455 | ); | ||
1456 | return; | ||
1457 | case PIX_FMT_BGR555: | ||
1458 | asm volatile( | ||
1459 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | ||
1460 | "mov %4, %%"REG_b" \n\t" | ||
1461 | "push %%"REG_BP" \n\t" | ||
1462 | YSCALEYUV2RGB(%%REGBP, %5) | ||
1463 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | ||
1464 | #ifdef DITHER1XBPP | ||
1465 | "paddusb "MANGLE(b5Dither)", %%mm2 \n\t" | ||
1466 | "paddusb "MANGLE(g5Dither)", %%mm4 \n\t" | ||
1467 | "paddusb "MANGLE(r5Dither)", %%mm5 \n\t" | ||
1468 | #endif | ||
1469 | |||
1470 | WRITEBGR15(%%REGb, 8280(%5), %%REGBP) | ||
1471 | "pop %%"REG_BP" \n\t" | ||
1472 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | ||
1473 | |||
1474 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | ||
1475 | "a" (&c->redDither) | ||
1476 | ); | ||
1477 | return; | ||
1478 | case PIX_FMT_BGR565: | ||
1479 | asm volatile( | ||
1480 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | ||
1481 | "mov %4, %%"REG_b" \n\t" | ||
1482 | "push %%"REG_BP" \n\t" | ||
1483 | YSCALEYUV2RGB(%%REGBP, %5) | ||
1484 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | ||
1485 | #ifdef DITHER1XBPP | ||
1486 | "paddusb "MANGLE(b5Dither)", %%mm2 \n\t" | ||
1487 | "paddusb "MANGLE(g6Dither)", %%mm4 \n\t" | ||
1488 | "paddusb "MANGLE(r5Dither)", %%mm5 \n\t" | ||
1489 | #endif | ||
1490 | |||
1491 | WRITEBGR16(%%REGb, 8280(%5), %%REGBP) | ||
1492 | "pop %%"REG_BP" \n\t" | ||
1493 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | ||
1494 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | ||
1495 | "a" (&c->redDither) | ||
1496 | ); | ||
1497 | return; | ||
1498 | case PIX_FMT_YUYV422: | ||
1499 | asm volatile( | ||
1500 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | ||
1501 | "mov %4, %%"REG_b" \n\t" | ||
1502 | "push %%"REG_BP" \n\t" | ||
1503 | YSCALEYUV2PACKED(%%REGBP, %5) | ||
1504 | WRITEYUY2(%%REGb, 8280(%5), %%REGBP) | ||
1505 | "pop %%"REG_BP" \n\t" | ||
1506 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | ||
1507 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | ||
1508 | "a" (&c->redDither) | ||
1509 | ); | ||
1510 | return; | ||
1511 | default: break; | ||
1512 | } | ||
1513 | #endif //HAVE_MMX | ||
1514 | YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C) | ||
1515 | } | ||
1516 | |||
1517 | /** | ||
1518 | * YV12 to RGB without scaling or interpolating | ||
1519 | */ | ||
1520 | static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1, | ||
1521 | uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y) | ||
1522 | { | ||
1523 | const int yalpha1=0; | ||
1524 | int i; | ||
1525 | |||
1526 | uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 | ||
1527 | const int yalpha= 4096; //FIXME ... | ||
1528 | |||
1529 | if (flags&SWS_FULL_CHR_H_INT) | ||
1530 | { | ||
1531 | RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y); | ||
1532 | return; | ||
1533 | } | ||
1534 | |||
1535 | #ifdef HAVE_MMX | ||
1536 | if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster | ||
1537 | { | ||
1538 | switch(dstFormat) | ||
1539 | { | ||
1540 | case PIX_FMT_RGB32: | ||
1541 | asm volatile( | ||
1542 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | ||
1543 | "mov %4, %%"REG_b" \n\t" | ||
1544 | "push %%"REG_BP" \n\t" | ||
1545 | YSCALEYUV2RGB1(%%REGBP, %5) | ||
1546 | WRITEBGR32(%%REGb, 8280(%5), %%REGBP) | ||
1547 | "pop %%"REG_BP" \n\t" | ||
1548 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | ||
1549 | |||
1550 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | ||
1551 | "a" (&c->redDither) | ||
1552 | ); | ||
1553 | return; | ||
1554 | case PIX_FMT_BGR24: | ||
1555 | asm volatile( | ||
1556 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | ||
1557 | "mov %4, %%"REG_b" \n\t" | ||
1558 | "push %%"REG_BP" \n\t" | ||
1559 | YSCALEYUV2RGB1(%%REGBP, %5) | ||
1560 | WRITEBGR24(%%REGb, 8280(%5), %%REGBP) | ||
1561 | "pop %%"REG_BP" \n\t" | ||
1562 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | ||
1563 | |||
1564 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | ||
1565 | "a" (&c->redDither) | ||
1566 | ); | ||
1567 | return; | ||
1568 | case PIX_FMT_BGR555: | ||
1569 | asm volatile( | ||
1570 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | ||
1571 | "mov %4, %%"REG_b" \n\t" | ||
1572 | "push %%"REG_BP" \n\t" | ||
1573 | YSCALEYUV2RGB1(%%REGBP, %5) | ||
1574 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | ||
1575 | #ifdef DITHER1XBPP | ||
1576 | "paddusb "MANGLE(b5Dither)", %%mm2 \n\t" | ||
1577 | "paddusb "MANGLE(g5Dither)", %%mm4 \n\t" | ||
1578 | "paddusb "MANGLE(r5Dither)", %%mm5 \n\t" | ||
1579 | #endif | ||
1580 | WRITEBGR15(%%REGb, 8280(%5), %%REGBP) | ||
1581 | "pop %%"REG_BP" \n\t" | ||
1582 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | ||
1583 | |||
1584 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | ||
1585 | "a" (&c->redDither) | ||
1586 | ); | ||
1587 | return; | ||
1588 | case PIX_FMT_BGR565: | ||
1589 | asm volatile( | ||
1590 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | ||
1591 | "mov %4, %%"REG_b" \n\t" | ||
1592 | "push %%"REG_BP" \n\t" | ||
1593 | YSCALEYUV2RGB1(%%REGBP, %5) | ||
1594 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | ||
1595 | #ifdef DITHER1XBPP | ||
1596 | "paddusb "MANGLE(b5Dither)", %%mm2 \n\t" | ||
1597 | "paddusb "MANGLE(g6Dither)", %%mm4 \n\t" | ||
1598 | "paddusb "MANGLE(r5Dither)", %%mm5 \n\t" | ||
1599 | #endif | ||
1600 | |||
1601 | WRITEBGR16(%%REGb, 8280(%5), %%REGBP) | ||
1602 | "pop %%"REG_BP" \n\t" | ||
1603 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | ||
1604 | |||
1605 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | ||
1606 | "a" (&c->redDither) | ||
1607 | ); | ||
1608 | return; | ||
1609 | case PIX_FMT_YUYV422: | ||
1610 | asm volatile( | ||
1611 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | ||
1612 | "mov %4, %%"REG_b" \n\t" | ||
1613 | "push %%"REG_BP" \n\t" | ||
1614 | YSCALEYUV2PACKED1(%%REGBP, %5) | ||
1615 | WRITEYUY2(%%REGb, 8280(%5), %%REGBP) | ||
1616 | "pop %%"REG_BP" \n\t" | ||
1617 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | ||
1618 | |||
1619 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | ||
1620 | "a" (&c->redDither) | ||
1621 | ); | ||
1622 | return; | ||
1623 | } | ||
1624 | } | ||
1625 | else | ||
1626 | { | ||
1627 | switch(dstFormat) | ||
1628 | { | ||
1629 | case PIX_FMT_RGB32: | ||
1630 | asm volatile( | ||
1631 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | ||
1632 | "mov %4, %%"REG_b" \n\t" | ||
1633 | "push %%"REG_BP" \n\t" | ||
1634 | YSCALEYUV2RGB1b(%%REGBP, %5) | ||
1635 | WRITEBGR32(%%REGb, 8280(%5), %%REGBP) | ||
1636 | "pop %%"REG_BP" \n\t" | ||
1637 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | ||
1638 | |||
1639 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | ||
1640 | "a" (&c->redDither) | ||
1641 | ); | ||
1642 | return; | ||
1643 | case PIX_FMT_BGR24: | ||
1644 | asm volatile( | ||
1645 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | ||
1646 | "mov %4, %%"REG_b" \n\t" | ||
1647 | "push %%"REG_BP" \n\t" | ||
1648 | YSCALEYUV2RGB1b(%%REGBP, %5) | ||
1649 | WRITEBGR24(%%REGb, 8280(%5), %%REGBP) | ||
1650 | "pop %%"REG_BP" \n\t" | ||
1651 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | ||
1652 | |||
1653 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | ||
1654 | "a" (&c->redDither) | ||
1655 | ); | ||
1656 | return; | ||
1657 | case PIX_FMT_BGR555: | ||
1658 | asm volatile( | ||
1659 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | ||
1660 | "mov %4, %%"REG_b" \n\t" | ||
1661 | "push %%"REG_BP" \n\t" | ||
1662 | YSCALEYUV2RGB1b(%%REGBP, %5) | ||
1663 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | ||
1664 | #ifdef DITHER1XBPP | ||
1665 | "paddusb "MANGLE(b5Dither)", %%mm2 \n\t" | ||
1666 | "paddusb "MANGLE(g5Dither)", %%mm4 \n\t" | ||
1667 | "paddusb "MANGLE(r5Dither)", %%mm5 \n\t" | ||
1668 | #endif | ||
1669 | WRITEBGR15(%%REGb, 8280(%5), %%REGBP) | ||
1670 | "pop %%"REG_BP" \n\t" | ||
1671 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | ||
1672 | |||
1673 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | ||
1674 | "a" (&c->redDither) | ||
1675 | ); | ||
1676 | return; | ||
1677 | case PIX_FMT_BGR565: | ||
1678 | asm volatile( | ||
1679 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | ||
1680 | "mov %4, %%"REG_b" \n\t" | ||
1681 | "push %%"REG_BP" \n\t" | ||
1682 | YSCALEYUV2RGB1b(%%REGBP, %5) | ||
1683 | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | ||
1684 | #ifdef DITHER1XBPP | ||
1685 | "paddusb "MANGLE(b5Dither)", %%mm2 \n\t" | ||
1686 | "paddusb "MANGLE(g6Dither)", %%mm4 \n\t" | ||
1687 | "paddusb "MANGLE(r5Dither)", %%mm5 \n\t" | ||
1688 | #endif | ||
1689 | |||
1690 | WRITEBGR16(%%REGb, 8280(%5), %%REGBP) | ||
1691 | "pop %%"REG_BP" \n\t" | ||
1692 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | ||
1693 | |||
1694 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | ||
1695 | "a" (&c->redDither) | ||
1696 | ); | ||
1697 | return; | ||
1698 | case PIX_FMT_YUYV422: | ||
1699 | asm volatile( | ||
1700 | "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | ||
1701 | "mov %4, %%"REG_b" \n\t" | ||
1702 | "push %%"REG_BP" \n\t" | ||
1703 | YSCALEYUV2PACKED1b(%%REGBP, %5) | ||
1704 | WRITEYUY2(%%REGb, 8280(%5), %%REGBP) | ||
1705 | "pop %%"REG_BP" \n\t" | ||
1706 | "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | ||
1707 | |||
1708 | :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), | ||
1709 | "a" (&c->redDither) | ||
1710 | ); | ||
1711 | return; | ||
1712 | } | ||
1713 | } | ||
1714 | #endif /* HAVE_MMX */ | ||
1715 | if (uvalpha < 2048) | ||
1716 | { | ||
1717 | YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C) | ||
1718 | }else{ | ||
1719 | YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C) | ||
1720 | } | ||
1721 | } | ||
1722 | |||
1723 | //FIXME yuy2* can read up to 7 samples too much | ||
1724 | |||
1725 | static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width) | ||
1726 | { | ||
1727 | #ifdef HAVE_MMX | ||
1728 | asm volatile( | ||
1729 | "movq "MANGLE(bm01010101)", %%mm2 \n\t" | ||
1730 | "mov %0, %%"REG_a" \n\t" | ||
1731 | "1: \n\t" | ||
1732 | "movq (%1, %%"REG_a",2), %%mm0 \n\t" | ||
1733 | "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" | ||
1734 | "pand %%mm2, %%mm0 \n\t" | ||
1735 | "pand %%mm2, %%mm1 \n\t" | ||
1736 | "packuswb %%mm1, %%mm0 \n\t" | ||
1737 | "movq %%mm0, (%2, %%"REG_a") \n\t" | ||
1738 | "add $8, %%"REG_a" \n\t" | ||
1739 | " js 1b \n\t" | ||
1740 | : : "g" (-width), "r" (src+width*2), "r" (dst+width) | ||
1741 | : "%"REG_a | ||
1742 | ); | ||
1743 | #else | ||
1744 | int i; | ||
1745 | for (i=0; i<width; i++) | ||
1746 | dst[i]= src[2*i]; | ||
1747 | #endif | ||
1748 | } | ||
1749 | |||
1750 | static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) | ||
1751 | { | ||
1752 | #ifdef HAVE_MMX | ||
1753 | asm volatile( | ||
1754 | "movq "MANGLE(bm01010101)", %%mm4 \n\t" | ||
1755 | "mov %0, %%"REG_a" \n\t" | ||
1756 | "1: \n\t" | ||
1757 | "movq (%1, %%"REG_a",4), %%mm0 \n\t" | ||
1758 | "movq 8(%1, %%"REG_a",4), %%mm1 \n\t" | ||
1759 | "psrlw $8, %%mm0 \n\t" | ||
1760 | "psrlw $8, %%mm1 \n\t" | ||
1761 | "packuswb %%mm1, %%mm0 \n\t" | ||
1762 | "movq %%mm0, %%mm1 \n\t" | ||
1763 | "psrlw $8, %%mm0 \n\t" | ||
1764 | "pand %%mm4, %%mm1 \n\t" | ||
1765 | "packuswb %%mm0, %%mm0 \n\t" | ||
1766 | "packuswb %%mm1, %%mm1 \n\t" | ||
1767 | "movd %%mm0, (%3, %%"REG_a") \n\t" | ||
1768 | "movd %%mm1, (%2, %%"REG_a") \n\t" | ||
1769 | "add $4, %%"REG_a" \n\t" | ||
1770 | " js 1b \n\t" | ||
1771 | : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width) | ||
1772 | : "%"REG_a | ||
1773 | ); | ||
1774 | #else | ||
1775 | int i; | ||
1776 | for (i=0; i<width; i++) | ||
1777 | { | ||
1778 | dstU[i]= src1[4*i + 1]; | ||
1779 | dstV[i]= src1[4*i + 3]; | ||
1780 | } | ||
1781 | #endif | ||
1782 | assert(src1 == src2); | ||
1783 | } | ||
1784 | |||
1785 | /* This is almost identical to the previous, end exists only because | ||
1786 | * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */ | ||
1787 | static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width) | ||
1788 | { | ||
1789 | #ifdef HAVE_MMX | ||
1790 | asm volatile( | ||
1791 | "mov %0, %%"REG_a" \n\t" | ||
1792 | "1: \n\t" | ||
1793 | "movq (%1, %%"REG_a",2), %%mm0 \n\t" | ||
1794 | "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" | ||
1795 | "psrlw $8, %%mm0 \n\t" | ||
1796 | "psrlw $8, %%mm1 \n\t" | ||
1797 | "packuswb %%mm1, %%mm0 \n\t" | ||
1798 | "movq %%mm0, (%2, %%"REG_a") \n\t" | ||
1799 | "add $8, %%"REG_a" \n\t" | ||
1800 | " js 1b \n\t" | ||
1801 | : : "g" (-width), "r" (src+width*2), "r" (dst+width) | ||
1802 | : "%"REG_a | ||
1803 | ); | ||
1804 | #else | ||
1805 | int i; | ||
1806 | for (i=0; i<width; i++) | ||
1807 | dst[i]= src[2*i+1]; | ||
1808 | #endif | ||
1809 | } | ||
1810 | |||
1811 | static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) | ||
1812 | { | ||
1813 | #ifdef HAVE_MMX | ||
1814 | asm volatile( | ||
1815 | "movq "MANGLE(bm01010101)", %%mm4 \n\t" | ||
1816 | "mov %0, %%"REG_a" \n\t" | ||
1817 | "1: \n\t" | ||
1818 | "movq (%1, %%"REG_a",4), %%mm0 \n\t" | ||
1819 | "movq 8(%1, %%"REG_a",4), %%mm1 \n\t" | ||
1820 | "pand %%mm4, %%mm0 \n\t" | ||
1821 | "pand %%mm4, %%mm1 \n\t" | ||
1822 | "packuswb %%mm1, %%mm0 \n\t" | ||
1823 | "movq %%mm0, %%mm1 \n\t" | ||
1824 | "psrlw $8, %%mm0 \n\t" | ||
1825 | "pand %%mm4, %%mm1 \n\t" | ||
1826 | "packuswb %%mm0, %%mm0 \n\t" | ||
1827 | "packuswb %%mm1, %%mm1 \n\t" | ||
1828 | "movd %%mm0, (%3, %%"REG_a") \n\t" | ||
1829 | "movd %%mm1, (%2, %%"REG_a") \n\t" | ||
1830 | "add $4, %%"REG_a" \n\t" | ||
1831 | " js 1b \n\t" | ||
1832 | : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width) | ||
1833 | : "%"REG_a | ||
1834 | ); | ||
1835 | #else | ||
1836 | int i; | ||
1837 | for (i=0; i<width; i++) | ||
1838 | { | ||
1839 | dstU[i]= src1[4*i + 0]; | ||
1840 | dstV[i]= src1[4*i + 2]; | ||
1841 | } | ||
1842 | #endif | ||
1843 | assert(src1 == src2); | ||
1844 | } | ||
1845 | |||
1846 | static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width) | ||
1847 | { | ||
1848 | int i; | ||
1849 | for (i=0; i<width; i++) | ||
1850 | { | ||
1851 | int b= ((uint32_t*)src)[i]&0xFF; | ||
1852 | int g= (((uint32_t*)src)[i]>>8)&0xFF; | ||
1853 | int r= (((uint32_t*)src)[i]>>16)&0xFF; | ||
1854 | |||
1855 | dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT); | ||
1856 | } | ||
1857 | } | ||
1858 | |||
1859 | static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | ||
1860 | { | ||
1861 | int i; | ||
1862 | assert(src1 == src2); | ||
1863 | for (i=0; i<width; i++) | ||
1864 | { | ||
1865 | const int a= ((uint32_t*)src1)[2*i+0]; | ||
1866 | const int e= ((uint32_t*)src1)[2*i+1]; | ||
1867 | const int l= (a&0xFF00FF) + (e&0xFF00FF); | ||
1868 | const int h= (a&0x00FF00) + (e&0x00FF00); | ||
1869 | const int b= l&0x3FF; | ||
1870 | const int g= h>>8; | ||
1871 | const int r= l>>16; | ||
1872 | |||
1873 | dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128; | ||
1874 | dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128; | ||
1875 | } | ||
1876 | } | ||
1877 | |||
1878 | static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width) | ||
1879 | { | ||
1880 | #ifdef HAVE_MMX | ||
1881 | asm volatile( | ||
1882 | "mov %2, %%"REG_a" \n\t" | ||
1883 | "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t" | ||
1884 | "movq "MANGLE(ff_w1111)", %%mm5 \n\t" | ||
1885 | "pxor %%mm7, %%mm7 \n\t" | ||
1886 | "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" | ||
1887 | ASMALIGN(4) | ||
1888 | "1: \n\t" | ||
1889 | PREFETCH" 64(%0, %%"REG_d") \n\t" | ||
1890 | "movd (%0, %%"REG_d"), %%mm0 \n\t" | ||
1891 | "movd 3(%0, %%"REG_d"), %%mm1 \n\t" | ||
1892 | "punpcklbw %%mm7, %%mm0 \n\t" | ||
1893 | "punpcklbw %%mm7, %%mm1 \n\t" | ||
1894 | "movd 6(%0, %%"REG_d"), %%mm2 \n\t" | ||
1895 | "movd 9(%0, %%"REG_d"), %%mm3 \n\t" | ||
1896 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
1897 | "punpcklbw %%mm7, %%mm3 \n\t" | ||
1898 | "pmaddwd %%mm6, %%mm0 \n\t" | ||
1899 | "pmaddwd %%mm6, %%mm1 \n\t" | ||
1900 | "pmaddwd %%mm6, %%mm2 \n\t" | ||
1901 | "pmaddwd %%mm6, %%mm3 \n\t" | ||
1902 | #ifndef FAST_BGR2YV12 | ||
1903 | "psrad $8, %%mm0 \n\t" | ||
1904 | "psrad $8, %%mm1 \n\t" | ||
1905 | "psrad $8, %%mm2 \n\t" | ||
1906 | "psrad $8, %%mm3 \n\t" | ||
1907 | #endif | ||
1908 | "packssdw %%mm1, %%mm0 \n\t" | ||
1909 | "packssdw %%mm3, %%mm2 \n\t" | ||
1910 | "pmaddwd %%mm5, %%mm0 \n\t" | ||
1911 | "pmaddwd %%mm5, %%mm2 \n\t" | ||
1912 | "packssdw %%mm2, %%mm0 \n\t" | ||
1913 | "psraw $7, %%mm0 \n\t" | ||
1914 | |||
1915 | "movd 12(%0, %%"REG_d"), %%mm4 \n\t" | ||
1916 | "movd 15(%0, %%"REG_d"), %%mm1 \n\t" | ||
1917 | "punpcklbw %%mm7, %%mm4 \n\t" | ||
1918 | "punpcklbw %%mm7, %%mm1 \n\t" | ||
1919 | "movd 18(%0, %%"REG_d"), %%mm2 \n\t" | ||
1920 | "movd 21(%0, %%"REG_d"), %%mm3 \n\t" | ||
1921 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
1922 | "punpcklbw %%mm7, %%mm3 \n\t" | ||
1923 | "pmaddwd %%mm6, %%mm4 \n\t" | ||
1924 | "pmaddwd %%mm6, %%mm1 \n\t" | ||
1925 | "pmaddwd %%mm6, %%mm2 \n\t" | ||
1926 | "pmaddwd %%mm6, %%mm3 \n\t" | ||
1927 | #ifndef FAST_BGR2YV12 | ||
1928 | "psrad $8, %%mm4 \n\t" | ||
1929 | "psrad $8, %%mm1 \n\t" | ||
1930 | "psrad $8, %%mm2 \n\t" | ||
1931 | "psrad $8, %%mm3 \n\t" | ||
1932 | #endif | ||
1933 | "packssdw %%mm1, %%mm4 \n\t" | ||
1934 | "packssdw %%mm3, %%mm2 \n\t" | ||
1935 | "pmaddwd %%mm5, %%mm4 \n\t" | ||
1936 | "pmaddwd %%mm5, %%mm2 \n\t" | ||
1937 | "add $24, %%"REG_d" \n\t" | ||
1938 | "packssdw %%mm2, %%mm4 \n\t" | ||
1939 | "psraw $7, %%mm4 \n\t" | ||
1940 | |||
1941 | "packuswb %%mm4, %%mm0 \n\t" | ||
1942 | "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t" | ||
1943 | |||
1944 | "movq %%mm0, (%1, %%"REG_a") \n\t" | ||
1945 | "add $8, %%"REG_a" \n\t" | ||
1946 | " js 1b \n\t" | ||
1947 | : : "r" (src+width*3), "r" (dst+width), "g" (-width) | ||
1948 | : "%"REG_a, "%"REG_d | ||
1949 | ); | ||
1950 | #else | ||
1951 | int i; | ||
1952 | for (i=0; i<width; i++) | ||
1953 | { | ||
1954 | int b= src[i*3+0]; | ||
1955 | int g= src[i*3+1]; | ||
1956 | int r= src[i*3+2]; | ||
1957 | |||
1958 | dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT); | ||
1959 | } | ||
1960 | #endif /* HAVE_MMX */ | ||
1961 | } | ||
1962 | |||
1963 | static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width) | ||
1964 | { | ||
1965 | #ifdef HAVE_MMX | ||
1966 | asm volatile( | ||
1967 | "mov %3, %%"REG_a" \n\t" | ||
1968 | "movq "MANGLE(ff_w1111)", %%mm5 \n\t" | ||
1969 | "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t" | ||
1970 | "pxor %%mm7, %%mm7 \n\t" | ||
1971 | "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" | ||
1972 | "add %%"REG_d", %%"REG_d" \n\t" | ||
1973 | ASMALIGN(4) | ||
1974 | "1: \n\t" | ||
1975 | PREFETCH" 64(%0, %%"REG_d") \n\t" | ||
1976 | #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | ||
1977 | "movq (%0, %%"REG_d"), %%mm0 \n\t" | ||
1978 | "movq 6(%0, %%"REG_d"), %%mm2 \n\t" | ||
1979 | "movq %%mm0, %%mm1 \n\t" | ||
1980 | "movq %%mm2, %%mm3 \n\t" | ||
1981 | "psrlq $24, %%mm0 \n\t" | ||
1982 | "psrlq $24, %%mm2 \n\t" | ||
1983 | PAVGB(%%mm1, %%mm0) | ||
1984 | PAVGB(%%mm3, %%mm2) | ||
1985 | "punpcklbw %%mm7, %%mm0 \n\t" | ||
1986 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
1987 | #else | ||
1988 | "movd (%0, %%"REG_d"), %%mm0 \n\t" | ||
1989 | "movd 3(%0, %%"REG_d"), %%mm2 \n\t" | ||
1990 | "punpcklbw %%mm7, %%mm0 \n\t" | ||
1991 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
1992 | "paddw %%mm2, %%mm0 \n\t" | ||
1993 | "movd 6(%0, %%"REG_d"), %%mm4 \n\t" | ||
1994 | "movd 9(%0, %%"REG_d"), %%mm2 \n\t" | ||
1995 | "punpcklbw %%mm7, %%mm4 \n\t" | ||
1996 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
1997 | "paddw %%mm4, %%mm2 \n\t" | ||
1998 | "psrlw $1, %%mm0 \n\t" | ||
1999 | "psrlw $1, %%mm2 \n\t" | ||
2000 | #endif | ||
2001 | "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t" | ||
2002 | "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t" | ||
2003 | |||
2004 | "pmaddwd %%mm0, %%mm1 \n\t" | ||
2005 | "pmaddwd %%mm2, %%mm3 \n\t" | ||
2006 | "pmaddwd %%mm6, %%mm0 \n\t" | ||
2007 | "pmaddwd %%mm6, %%mm2 \n\t" | ||
2008 | #ifndef FAST_BGR2YV12 | ||
2009 | "psrad $8, %%mm0 \n\t" | ||
2010 | "psrad $8, %%mm1 \n\t" | ||
2011 | "psrad $8, %%mm2 \n\t" | ||
2012 | "psrad $8, %%mm3 \n\t" | ||
2013 | #endif | ||
2014 | "packssdw %%mm2, %%mm0 \n\t" | ||
2015 | "packssdw %%mm3, %%mm1 \n\t" | ||
2016 | "pmaddwd %%mm5, %%mm0 \n\t" | ||
2017 | "pmaddwd %%mm5, %%mm1 \n\t" | ||
2018 | "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 | ||
2019 | "psraw $7, %%mm0 \n\t" | ||
2020 | |||
2021 | #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | ||
2022 | "movq 12(%0, %%"REG_d"), %%mm4 \n\t" | ||
2023 | "movq 18(%0, %%"REG_d"), %%mm2 \n\t" | ||
2024 | "movq %%mm4, %%mm1 \n\t" | ||
2025 | "movq %%mm2, %%mm3 \n\t" | ||
2026 | "psrlq $24, %%mm4 \n\t" | ||
2027 | "psrlq $24, %%mm2 \n\t" | ||
2028 | PAVGB(%%mm1, %%mm4) | ||
2029 | PAVGB(%%mm3, %%mm2) | ||
2030 | "punpcklbw %%mm7, %%mm4 \n\t" | ||
2031 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
2032 | #else | ||
2033 | "movd 12(%0, %%"REG_d"), %%mm4 \n\t" | ||
2034 | "movd 15(%0, %%"REG_d"), %%mm2 \n\t" | ||
2035 | "punpcklbw %%mm7, %%mm4 \n\t" | ||
2036 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
2037 | "paddw %%mm2, %%mm4 \n\t" | ||
2038 | "movd 18(%0, %%"REG_d"), %%mm5 \n\t" | ||
2039 | "movd 21(%0, %%"REG_d"), %%mm2 \n\t" | ||
2040 | "punpcklbw %%mm7, %%mm5 \n\t" | ||
2041 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
2042 | "paddw %%mm5, %%mm2 \n\t" | ||
2043 | "movq "MANGLE(ff_w1111)", %%mm5 \n\t" | ||
2044 | "psrlw $2, %%mm4 \n\t" | ||
2045 | "psrlw $2, %%mm2 \n\t" | ||
2046 | #endif | ||
2047 | "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t" | ||
2048 | "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t" | ||
2049 | |||
2050 | "pmaddwd %%mm4, %%mm1 \n\t" | ||
2051 | "pmaddwd %%mm2, %%mm3 \n\t" | ||
2052 | "pmaddwd %%mm6, %%mm4 \n\t" | ||
2053 | "pmaddwd %%mm6, %%mm2 \n\t" | ||
2054 | #ifndef FAST_BGR2YV12 | ||
2055 | "psrad $8, %%mm4 \n\t" | ||
2056 | "psrad $8, %%mm1 \n\t" | ||
2057 | "psrad $8, %%mm2 \n\t" | ||
2058 | "psrad $8, %%mm3 \n\t" | ||
2059 | #endif | ||
2060 | "packssdw %%mm2, %%mm4 \n\t" | ||
2061 | "packssdw %%mm3, %%mm1 \n\t" | ||
2062 | "pmaddwd %%mm5, %%mm4 \n\t" | ||
2063 | "pmaddwd %%mm5, %%mm1 \n\t" | ||
2064 | "add $24, %%"REG_d" \n\t" | ||
2065 | "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 | ||
2066 | "psraw $7, %%mm4 \n\t" | ||
2067 | |||
2068 | "movq %%mm0, %%mm1 \n\t" | ||
2069 | "punpckldq %%mm4, %%mm0 \n\t" | ||
2070 | "punpckhdq %%mm4, %%mm1 \n\t" | ||
2071 | "packsswb %%mm1, %%mm0 \n\t" | ||
2072 | "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t" | ||
2073 | |||
2074 | "movd %%mm0, (%1, %%"REG_a") \n\t" | ||
2075 | "punpckhdq %%mm0, %%mm0 \n\t" | ||
2076 | "movd %%mm0, (%2, %%"REG_a") \n\t" | ||
2077 | "add $4, %%"REG_a" \n\t" | ||
2078 | " js 1b \n\t" | ||
2079 | : : "r" (src1+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width) | ||
2080 | : "%"REG_a, "%"REG_d | ||
2081 | ); | ||
2082 | #else | ||
2083 | int i; | ||
2084 | for (i=0; i<width; i++) | ||
2085 | { | ||
2086 | int b= src1[6*i + 0] + src1[6*i + 3]; | ||
2087 | int g= src1[6*i + 1] + src1[6*i + 4]; | ||
2088 | int r= src1[6*i + 2] + src1[6*i + 5]; | ||
2089 | |||
2090 | dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128; | ||
2091 | dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128; | ||
2092 | } | ||
2093 | #endif /* HAVE_MMX */ | ||
2094 | assert(src1 == src2); | ||
2095 | } | ||
2096 | |||
2097 | static inline void RENAME(rgb16ToY)(uint8_t *dst, uint8_t *src, int width) | ||
2098 | { | ||
2099 | int i; | ||
2100 | for (i=0; i<width; i++) | ||
2101 | { | ||
2102 | int d= ((uint16_t*)src)[i]; | ||
2103 | int b= d&0x1F; | ||
2104 | int g= (d>>5)&0x3F; | ||
2105 | int r= (d>>11)&0x1F; | ||
2106 | |||
2107 | dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16; | ||
2108 | } | ||
2109 | } | ||
2110 | |||
2111 | static inline void RENAME(rgb16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | ||
2112 | { | ||
2113 | int i; | ||
2114 | assert(src1==src2); | ||
2115 | for (i=0; i<width; i++) | ||
2116 | { | ||
2117 | int d0= ((uint32_t*)src1)[i]; | ||
2118 | |||
2119 | int dl= (d0&0x07E0F81F); | ||
2120 | int dh= ((d0>>5)&0x07C0F83F); | ||
2121 | |||
2122 | int dh2= (dh>>11) + (dh<<21); | ||
2123 | int d= dh2 + dl; | ||
2124 | |||
2125 | int b= d&0x7F; | ||
2126 | int r= (d>>11)&0x7F; | ||
2127 | int g= d>>21; | ||
2128 | dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128; | ||
2129 | dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128; | ||
2130 | } | ||
2131 | } | ||
2132 | |||
2133 | static inline void RENAME(rgb15ToY)(uint8_t *dst, uint8_t *src, int width) | ||
2134 | { | ||
2135 | int i; | ||
2136 | for (i=0; i<width; i++) | ||
2137 | { | ||
2138 | int d= ((uint16_t*)src)[i]; | ||
2139 | int b= d&0x1F; | ||
2140 | int g= (d>>5)&0x1F; | ||
2141 | int r= (d>>10)&0x1F; | ||
2142 | |||
2143 | dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16; | ||
2144 | } | ||
2145 | } | ||
2146 | |||
2147 | static inline void RENAME(rgb15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | ||
2148 | { | ||
2149 | int i; | ||
2150 | assert(src1==src2); | ||
2151 | for (i=0; i<width; i++) | ||
2152 | { | ||
2153 | int d0= ((uint32_t*)src1)[i]; | ||
2154 | |||
2155 | int dl= (d0&0x03E07C1F); | ||
2156 | int dh= ((d0>>5)&0x03E0F81F); | ||
2157 | |||
2158 | int dh2= (dh>>11) + (dh<<21); | ||
2159 | int d= dh2 + dl; | ||
2160 | |||
2161 | int b= d&0x7F; | ||
2162 | int r= (d>>10)&0x7F; | ||
2163 | int g= d>>21; | ||
2164 | dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128; | ||
2165 | dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128; | ||
2166 | } | ||
2167 | } | ||
2168 | |||
2169 | |||
2170 | static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width) | ||
2171 | { | ||
2172 | int i; | ||
2173 | for (i=0; i<width; i++) | ||
2174 | { | ||
2175 | int r= ((uint32_t*)src)[i]&0xFF; | ||
2176 | int g= (((uint32_t*)src)[i]>>8)&0xFF; | ||
2177 | int b= (((uint32_t*)src)[i]>>16)&0xFF; | ||
2178 | |||
2179 | dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT); | ||
2180 | } | ||
2181 | } | ||
2182 | |||
2183 | static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | ||
2184 | { | ||
2185 | int i; | ||
2186 | assert(src1==src2); | ||
2187 | for (i=0; i<width; i++) | ||
2188 | { | ||
2189 | const int a= ((uint32_t*)src1)[2*i+0]; | ||
2190 | const int e= ((uint32_t*)src1)[2*i+1]; | ||
2191 | const int l= (a&0xFF00FF) + (e&0xFF00FF); | ||
2192 | const int h= (a&0x00FF00) + (e&0x00FF00); | ||
2193 | const int r= l&0x3FF; | ||
2194 | const int g= h>>8; | ||
2195 | const int b= l>>16; | ||
2196 | |||
2197 | dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128; | ||
2198 | dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128; | ||
2199 | } | ||
2200 | } | ||
2201 | |||
2202 | static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width) | ||
2203 | { | ||
2204 | int i; | ||
2205 | for (i=0; i<width; i++) | ||
2206 | { | ||
2207 | int r= src[i*3+0]; | ||
2208 | int g= src[i*3+1]; | ||
2209 | int b= src[i*3+2]; | ||
2210 | |||
2211 | dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT); | ||
2212 | } | ||
2213 | } | ||
2214 | |||
2215 | static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | ||
2216 | { | ||
2217 | int i; | ||
2218 | assert(src1==src2); | ||
2219 | for (i=0; i<width; i++) | ||
2220 | { | ||
2221 | int r= src1[6*i + 0] + src1[6*i + 3]; | ||
2222 | int g= src1[6*i + 1] + src1[6*i + 4]; | ||
2223 | int b= src1[6*i + 2] + src1[6*i + 5]; | ||
2224 | |||
2225 | dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128; | ||
2226 | dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128; | ||
2227 | } | ||
2228 | } | ||
2229 | |||
2230 | static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width) | ||
2231 | { | ||
2232 | int i; | ||
2233 | for (i=0; i<width; i++) | ||
2234 | { | ||
2235 | int d= ((uint16_t*)src)[i]; | ||
2236 | int r= d&0x1F; | ||
2237 | int g= (d>>5)&0x3F; | ||
2238 | int b= (d>>11)&0x1F; | ||
2239 | |||
2240 | dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16; | ||
2241 | } | ||
2242 | } | ||
2243 | |||
2244 | static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | ||
2245 | { | ||
2246 | int i; | ||
2247 | assert(src1 == src2); | ||
2248 | for (i=0; i<width; i++) | ||
2249 | { | ||
2250 | int d0= ((uint32_t*)src1)[i]; | ||
2251 | |||
2252 | int dl= (d0&0x07E0F81F); | ||
2253 | int d= dl + (((d0>>16) + (d0<<16))&0x07E0F81F); | ||
2254 | |||
2255 | int r= d&0x3F; | ||
2256 | int b= (d>>11)&0x3F; | ||
2257 | int g= d>>21; | ||
2258 | dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128; | ||
2259 | dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128; | ||
2260 | } | ||
2261 | } | ||
2262 | |||
2263 | static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width) | ||
2264 | { | ||
2265 | int i; | ||
2266 | for (i=0; i<width; i++) | ||
2267 | { | ||
2268 | int d= ((uint16_t*)src)[i]; | ||
2269 | int r= d&0x1F; | ||
2270 | int g= (d>>5)&0x1F; | ||
2271 | int b= (d>>10)&0x1F; | ||
2272 | |||
2273 | dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16; | ||
2274 | } | ||
2275 | } | ||
2276 | |||
2277 | static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width) | ||
2278 | { | ||
2279 | int i; | ||
2280 | assert(src1 == src2); | ||
2281 | for (i=0; i<width; i++) | ||
2282 | { | ||
2283 | int d0= ((uint32_t*)src1)[i]; | ||
2284 | |||
2285 | int dl= (d0&0x03E07C1F); | ||
2286 | int d= dl + (((d0>>16) + (d0<<16))&0x03E07C1F); | ||
2287 | |||
2288 | int r= d&0x3F; | ||
2289 | int b= (d>>10)&0x3F; | ||
2290 | int g= d>>21; | ||
2291 | dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128; | ||
2292 | dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128; | ||
2293 | } | ||
2294 | } | ||
2295 | |||
2296 | static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, int width, uint32_t *pal) | ||
2297 | { | ||
2298 | int i; | ||
2299 | for (i=0; i<width; i++) | ||
2300 | { | ||
2301 | int d= src[i]; | ||
2302 | |||
2303 | dst[i]= pal[d] & 0xFF; | ||
2304 | } | ||
2305 | } | ||
2306 | |||
2307 | static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width, uint32_t *pal) | ||
2308 | { | ||
2309 | int i; | ||
2310 | assert(src1 == src2); | ||
2311 | for (i=0; i<width; i++) | ||
2312 | { | ||
2313 | int p= pal[src1[i]]; | ||
2314 | |||
2315 | dstU[i]= p>>8; | ||
2316 | dstV[i]= p>>16; | ||
2317 | } | ||
2318 | } | ||
2319 | |||
2320 | // bilinear / bicubic scaling | ||
2321 | static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, | ||
2322 | int16_t *filter, int16_t *filterPos, long filterSize) | ||
2323 | { | ||
2324 | #ifdef HAVE_MMX | ||
2325 | assert(filterSize % 4 == 0 && filterSize>0); | ||
2326 | if (filterSize==4) // Always true for upscaling, sometimes for down, too. | ||
2327 | { | ||
2328 | long counter= -2*dstW; | ||
2329 | filter-= counter*2; | ||
2330 | filterPos-= counter/2; | ||
2331 | dst-= counter/2; | ||
2332 | asm volatile( | ||
2333 | #if defined(PIC) | ||
2334 | "push %%"REG_b" \n\t" | ||
2335 | #endif | ||
2336 | "pxor %%mm7, %%mm7 \n\t" | ||
2337 | "movq "MANGLE(w02)", %%mm6 \n\t" | ||
2338 | "push %%"REG_BP" \n\t" // we use 7 regs here ... | ||
2339 | "mov %%"REG_a", %%"REG_BP" \n\t" | ||
2340 | ASMALIGN(4) | ||
2341 | "1: \n\t" | ||
2342 | "movzwl (%2, %%"REG_BP"), %%eax \n\t" | ||
2343 | "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t" | ||
2344 | "movq (%1, %%"REG_BP", 4), %%mm1 \n\t" | ||
2345 | "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t" | ||
2346 | "movd (%3, %%"REG_a"), %%mm0 \n\t" | ||
2347 | "movd (%3, %%"REG_b"), %%mm2 \n\t" | ||
2348 | "punpcklbw %%mm7, %%mm0 \n\t" | ||
2349 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
2350 | "pmaddwd %%mm1, %%mm0 \n\t" | ||
2351 | "pmaddwd %%mm2, %%mm3 \n\t" | ||
2352 | "psrad $8, %%mm0 \n\t" | ||
2353 | "psrad $8, %%mm3 \n\t" | ||
2354 | "packssdw %%mm3, %%mm0 \n\t" | ||
2355 | "pmaddwd %%mm6, %%mm0 \n\t" | ||
2356 | "packssdw %%mm0, %%mm0 \n\t" | ||
2357 | "movd %%mm0, (%4, %%"REG_BP") \n\t" | ||
2358 | "add $4, %%"REG_BP" \n\t" | ||
2359 | " jnc 1b \n\t" | ||
2360 | |||
2361 | "pop %%"REG_BP" \n\t" | ||
2362 | #if defined(PIC) | ||
2363 | "pop %%"REG_b" \n\t" | ||
2364 | #endif | ||
2365 | : "+a" (counter) | ||
2366 | : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | ||
2367 | #if !defined(PIC) | ||
2368 | : "%"REG_b | ||
2369 | #endif | ||
2370 | ); | ||
2371 | } | ||
2372 | else if (filterSize==8) | ||
2373 | { | ||
2374 | long counter= -2*dstW; | ||
2375 | filter-= counter*4; | ||
2376 | filterPos-= counter/2; | ||
2377 | dst-= counter/2; | ||
2378 | asm volatile( | ||
2379 | #if defined(PIC) | ||
2380 | "push %%"REG_b" \n\t" | ||
2381 | #endif | ||
2382 | "pxor %%mm7, %%mm7 \n\t" | ||
2383 | "movq "MANGLE(w02)", %%mm6 \n\t" | ||
2384 | "push %%"REG_BP" \n\t" // we use 7 regs here ... | ||
2385 | "mov %%"REG_a", %%"REG_BP" \n\t" | ||
2386 | ASMALIGN(4) | ||
2387 | "1: \n\t" | ||
2388 | "movzwl (%2, %%"REG_BP"), %%eax \n\t" | ||
2389 | "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t" | ||
2390 | "movq (%1, %%"REG_BP", 8), %%mm1 \n\t" | ||
2391 | "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t" | ||
2392 | "movd (%3, %%"REG_a"), %%mm0 \n\t" | ||
2393 | "movd (%3, %%"REG_b"), %%mm2 \n\t" | ||
2394 | "punpcklbw %%mm7, %%mm0 \n\t" | ||
2395 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
2396 | "pmaddwd %%mm1, %%mm0 \n\t" | ||
2397 | "pmaddwd %%mm2, %%mm3 \n\t" | ||
2398 | |||
2399 | "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t" | ||
2400 | "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t" | ||
2401 | "movd 4(%3, %%"REG_a"), %%mm4 \n\t" | ||
2402 | "movd 4(%3, %%"REG_b"), %%mm2 \n\t" | ||
2403 | "punpcklbw %%mm7, %%mm4 \n\t" | ||
2404 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
2405 | "pmaddwd %%mm1, %%mm4 \n\t" | ||
2406 | "pmaddwd %%mm2, %%mm5 \n\t" | ||
2407 | "paddd %%mm4, %%mm0 \n\t" | ||
2408 | "paddd %%mm5, %%mm3 \n\t" | ||
2409 | |||
2410 | "psrad $8, %%mm0 \n\t" | ||
2411 | "psrad $8, %%mm3 \n\t" | ||
2412 | "packssdw %%mm3, %%mm0 \n\t" | ||
2413 | "pmaddwd %%mm6, %%mm0 \n\t" | ||
2414 | "packssdw %%mm0, %%mm0 \n\t" | ||
2415 | "movd %%mm0, (%4, %%"REG_BP") \n\t" | ||
2416 | "add $4, %%"REG_BP" \n\t" | ||
2417 | " jnc 1b \n\t" | ||
2418 | |||
2419 | "pop %%"REG_BP" \n\t" | ||
2420 | #if defined(PIC) | ||
2421 | "pop %%"REG_b" \n\t" | ||
2422 | #endif | ||
2423 | : "+a" (counter) | ||
2424 | : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) | ||
2425 | #if !defined(PIC) | ||
2426 | : "%"REG_b | ||
2427 | #endif | ||
2428 | ); | ||
2429 | } | ||
2430 | else | ||
2431 | { | ||
2432 | uint8_t *offset = src+filterSize; | ||
2433 | long counter= -2*dstW; | ||
2434 | //filter-= counter*filterSize/2; | ||
2435 | filterPos-= counter/2; | ||
2436 | dst-= counter/2; | ||
2437 | asm volatile( | ||
2438 | "pxor %%mm7, %%mm7 \n\t" | ||
2439 | "movq "MANGLE(w02)", %%mm6 \n\t" | ||
2440 | ASMALIGN(4) | ||
2441 | "1: \n\t" | ||
2442 | "mov %2, %%"REG_c" \n\t" | ||
2443 | "movzwl (%%"REG_c", %0), %%eax \n\t" | ||
2444 | "movzwl 2(%%"REG_c", %0), %%edx \n\t" | ||
2445 | "mov %5, %%"REG_c" \n\t" | ||
2446 | "pxor %%mm4, %%mm4 \n\t" | ||
2447 | "pxor %%mm5, %%mm5 \n\t" | ||
2448 | "2: \n\t" | ||
2449 | "movq (%1), %%mm1 \n\t" | ||
2450 | "movq (%1, %6), %%mm3 \n\t" | ||
2451 | "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t" | ||
2452 | "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t" | ||
2453 | "punpcklbw %%mm7, %%mm0 \n\t" | ||
2454 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
2455 | "pmaddwd %%mm1, %%mm0 \n\t" | ||
2456 | "pmaddwd %%mm2, %%mm3 \n\t" | ||
2457 | "paddd %%mm3, %%mm5 \n\t" | ||
2458 | "paddd %%mm0, %%mm4 \n\t" | ||
2459 | "add $8, %1 \n\t" | ||
2460 | "add $4, %%"REG_c" \n\t" | ||
2461 | "cmp %4, %%"REG_c" \n\t" | ||
2462 | " jb 2b \n\t" | ||
2463 | "add %6, %1 \n\t" | ||
2464 | "psrad $8, %%mm4 \n\t" | ||
2465 | "psrad $8, %%mm5 \n\t" | ||
2466 | "packssdw %%mm5, %%mm4 \n\t" | ||
2467 | "pmaddwd %%mm6, %%mm4 \n\t" | ||
2468 | "packssdw %%mm4, %%mm4 \n\t" | ||
2469 | "mov %3, %%"REG_a" \n\t" | ||
2470 | "movd %%mm4, (%%"REG_a", %0) \n\t" | ||
2471 | "add $4, %0 \n\t" | ||
2472 | " jnc 1b \n\t" | ||
2473 | |||
2474 | : "+r" (counter), "+r" (filter) | ||
2475 | : "m" (filterPos), "m" (dst), "m"(offset), | ||
2476 | "m" (src), "r" (filterSize*2) | ||
2477 | : "%"REG_a, "%"REG_c, "%"REG_d | ||
2478 | ); | ||
2479 | } | ||
2480 | #else | ||
2481 | #ifdef HAVE_ALTIVEC | ||
2482 | hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize); | ||
2483 | #else | ||
2484 | int i; | ||
2485 | for (i=0; i<dstW; i++) | ||
2486 | { | ||
2487 | int j; | ||
2488 | int srcPos= filterPos[i]; | ||
2489 | int val=0; | ||
2490 | //printf("filterPos: %d\n", filterPos[i]); | ||
2491 | for (j=0; j<filterSize; j++) | ||
2492 | { | ||
2493 | //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]); | ||
2494 | val += ((int)src[srcPos + j])*filter[filterSize*i + j]; | ||
2495 | } | ||
2496 | //filter += hFilterSize; | ||
2497 | dst[i] = av_clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ... | ||
2498 | //dst[i] = val>>7; | ||
2499 | } | ||
2500 | #endif /* HAVE_ALTIVEC */ | ||
2501 | #endif /* HAVE_MMX */ | ||
2502 | } | ||
2503 | // *** horizontal scale Y line to temp buffer | ||
2504 | static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc, | ||
2505 | int flags, int canMMX2BeUsed, int16_t *hLumFilter, | ||
2506 | int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, | ||
2507 | int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, | ||
2508 | int32_t *mmx2FilterPos, uint8_t *pal) | ||
2509 | { | ||
2510 | if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE) | ||
2511 | { | ||
2512 | RENAME(yuy2ToY)(formatConvBuffer, src, srcW); | ||
2513 | src= formatConvBuffer; | ||
2514 | } | ||
2515 | else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE) | ||
2516 | { | ||
2517 | RENAME(uyvyToY)(formatConvBuffer, src, srcW); | ||
2518 | src= formatConvBuffer; | ||
2519 | } | ||
2520 | else if (srcFormat==PIX_FMT_RGB32) | ||
2521 | { | ||
2522 | RENAME(bgr32ToY)(formatConvBuffer, src, srcW); | ||
2523 | src= formatConvBuffer; | ||
2524 | } | ||
2525 | else if (srcFormat==PIX_FMT_BGR24) | ||
2526 | { | ||
2527 | RENAME(bgr24ToY)(formatConvBuffer, src, srcW); | ||
2528 | src= formatConvBuffer; | ||
2529 | } | ||
2530 | else if (srcFormat==PIX_FMT_BGR565) | ||
2531 | { | ||
2532 | RENAME(bgr16ToY)(formatConvBuffer, src, srcW); | ||
2533 | src= formatConvBuffer; | ||
2534 | } | ||
2535 | else if (srcFormat==PIX_FMT_BGR555) | ||
2536 | { | ||
2537 | RENAME(bgr15ToY)(formatConvBuffer, src, srcW); | ||
2538 | src= formatConvBuffer; | ||
2539 | } | ||
2540 | else if (srcFormat==PIX_FMT_BGR32) | ||
2541 | { | ||
2542 | RENAME(rgb32ToY)(formatConvBuffer, src, srcW); | ||
2543 | src= formatConvBuffer; | ||
2544 | } | ||
2545 | else if (srcFormat==PIX_FMT_RGB24) | ||
2546 | { | ||
2547 | RENAME(rgb24ToY)(formatConvBuffer, src, srcW); | ||
2548 | src= formatConvBuffer; | ||
2549 | } | ||
2550 | else if (srcFormat==PIX_FMT_RGB565) | ||
2551 | { | ||
2552 | RENAME(rgb16ToY)(formatConvBuffer, src, srcW); | ||
2553 | src= formatConvBuffer; | ||
2554 | } | ||
2555 | else if (srcFormat==PIX_FMT_RGB555) | ||
2556 | { | ||
2557 | RENAME(rgb15ToY)(formatConvBuffer, src, srcW); | ||
2558 | src= formatConvBuffer; | ||
2559 | } | ||
2560 | else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE) | ||
2561 | { | ||
2562 | RENAME(palToY)(formatConvBuffer, src, srcW, (uint32_t*)pal); | ||
2563 | src= formatConvBuffer; | ||
2564 | } | ||
2565 | |||
2566 | #ifdef HAVE_MMX | ||
2567 | // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one). | ||
2568 | if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) | ||
2569 | #else | ||
2570 | if (!(flags&SWS_FAST_BILINEAR)) | ||
2571 | #endif | ||
2572 | { | ||
2573 | RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize); | ||
2574 | } | ||
2575 | else // fast bilinear upscale / crap downscale | ||
2576 | { | ||
2577 | #if defined(ARCH_X86) | ||
2578 | #ifdef HAVE_MMX2 | ||
2579 | int i; | ||
2580 | #if defined(PIC) | ||
2581 | uint64_t ebxsave __attribute__((aligned(8))); | ||
2582 | #endif | ||
2583 | if (canMMX2BeUsed) | ||
2584 | { | ||
2585 | asm volatile( | ||
2586 | #if defined(PIC) | ||
2587 | "mov %%"REG_b", %5 \n\t" | ||
2588 | #endif | ||
2589 | "pxor %%mm7, %%mm7 \n\t" | ||
2590 | "mov %0, %%"REG_c" \n\t" | ||
2591 | "mov %1, %%"REG_D" \n\t" | ||
2592 | "mov %2, %%"REG_d" \n\t" | ||
2593 | "mov %3, %%"REG_b" \n\t" | ||
2594 | "xor %%"REG_a", %%"REG_a" \n\t" // i | ||
2595 | PREFETCH" (%%"REG_c") \n\t" | ||
2596 | PREFETCH" 32(%%"REG_c") \n\t" | ||
2597 | PREFETCH" 64(%%"REG_c") \n\t" | ||
2598 | |||
2599 | #ifdef ARCH_X86_64 | ||
2600 | |||
2601 | #define FUNNY_Y_CODE \ | ||
2602 | "movl (%%"REG_b"), %%esi \n\t"\ | ||
2603 | "call *%4 \n\t"\ | ||
2604 | "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\ | ||
2605 | "add %%"REG_S", %%"REG_c" \n\t"\ | ||
2606 | "add %%"REG_a", %%"REG_D" \n\t"\ | ||
2607 | "xor %%"REG_a", %%"REG_a" \n\t"\ | ||
2608 | |||
2609 | #else | ||
2610 | |||
2611 | #define FUNNY_Y_CODE \ | ||
2612 | "movl (%%"REG_b"), %%esi \n\t"\ | ||
2613 | "call *%4 \n\t"\ | ||
2614 | "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\ | ||
2615 | "add %%"REG_a", %%"REG_D" \n\t"\ | ||
2616 | "xor %%"REG_a", %%"REG_a" \n\t"\ | ||
2617 | |||
2618 | #endif /* ARCH_X86_64 */ | ||
2619 | |||
2620 | FUNNY_Y_CODE | ||
2621 | FUNNY_Y_CODE | ||
2622 | FUNNY_Y_CODE | ||
2623 | FUNNY_Y_CODE | ||
2624 | FUNNY_Y_CODE | ||
2625 | FUNNY_Y_CODE | ||
2626 | FUNNY_Y_CODE | ||
2627 | FUNNY_Y_CODE | ||
2628 | |||
2629 | #if defined(PIC) | ||
2630 | "mov %5, %%"REG_b" \n\t" | ||
2631 | #endif | ||
2632 | :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), | ||
2633 | "m" (funnyYCode) | ||
2634 | #if defined(PIC) | ||
2635 | ,"m" (ebxsave) | ||
2636 | #endif | ||
2637 | : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D | ||
2638 | #if !defined(PIC) | ||
2639 | ,"%"REG_b | ||
2640 | #endif | ||
2641 | ); | ||
2642 | for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128; | ||
2643 | } | ||
2644 | else | ||
2645 | { | ||
2646 | #endif /* HAVE_MMX2 */ | ||
2647 | long xInc_shr16 = xInc >> 16; | ||
2648 | uint16_t xInc_mask = xInc & 0xffff; | ||
2649 | //NO MMX just normal asm ... | ||
2650 | asm volatile( | ||
2651 | "xor %%"REG_a", %%"REG_a" \n\t" // i | ||
2652 | "xor %%"REG_d", %%"REG_d" \n\t" // xx | ||
2653 | "xorl %%ecx, %%ecx \n\t" // 2*xalpha | ||
2654 | ASMALIGN(4) | ||
2655 | "1: \n\t" | ||
2656 | "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx] | ||
2657 | "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1] | ||
2658 | "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | ||
2659 | "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | ||
2660 | "shll $16, %%edi \n\t" | ||
2661 | "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | ||
2662 | "mov %1, %%"REG_D" \n\t" | ||
2663 | "shrl $9, %%esi \n\t" | ||
2664 | "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t" | ||
2665 | "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | ||
2666 | "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry | ||
2667 | |||
2668 | "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx] | ||
2669 | "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1] | ||
2670 | "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | ||
2671 | "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | ||
2672 | "shll $16, %%edi \n\t" | ||
2673 | "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | ||
2674 | "mov %1, %%"REG_D" \n\t" | ||
2675 | "shrl $9, %%esi \n\t" | ||
2676 | "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t" | ||
2677 | "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | ||
2678 | "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry | ||
2679 | |||
2680 | |||
2681 | "add $2, %%"REG_a" \n\t" | ||
2682 | "cmp %2, %%"REG_a" \n\t" | ||
2683 | " jb 1b \n\t" | ||
2684 | |||
2685 | |||
2686 | :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask) | ||
2687 | : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi" | ||
2688 | ); | ||
2689 | #ifdef HAVE_MMX2 | ||
2690 | } //if MMX2 can't be used | ||
2691 | #endif | ||
2692 | #else | ||
2693 | int i; | ||
2694 | unsigned int xpos=0; | ||
2695 | for (i=0;i<dstWidth;i++) | ||
2696 | { | ||
2697 | register unsigned int xx=xpos>>16; | ||
2698 | register unsigned int xalpha=(xpos&0xFFFF)>>9; | ||
2699 | dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha; | ||
2700 | xpos+=xInc; | ||
2701 | } | ||
2702 | #endif /* defined(ARCH_X86) */ | ||
2703 | } | ||
2704 | } | ||
2705 | |||
2706 | inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2, | ||
2707 | int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter, | ||
2708 | int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode, | ||
2709 | int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter, | ||
2710 | int32_t *mmx2FilterPos, uint8_t *pal) | ||
2711 | { | ||
2712 | if (srcFormat==PIX_FMT_YUYV422) | ||
2713 | { | ||
2714 | RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW); | ||
2715 | src1= formatConvBuffer; | ||
2716 | src2= formatConvBuffer+VOFW; | ||
2717 | } | ||
2718 | else if (srcFormat==PIX_FMT_UYVY422) | ||
2719 | { | ||
2720 | RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW); | ||
2721 | src1= formatConvBuffer; | ||
2722 | src2= formatConvBuffer+VOFW; | ||
2723 | } | ||
2724 | else if (srcFormat==PIX_FMT_RGB32) | ||
2725 | { | ||
2726 | RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW); | ||
2727 | src1= formatConvBuffer; | ||
2728 | src2= formatConvBuffer+VOFW; | ||
2729 | } | ||
2730 | else if (srcFormat==PIX_FMT_BGR24) | ||
2731 | { | ||
2732 | RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW); | ||
2733 | src1= formatConvBuffer; | ||
2734 | src2= formatConvBuffer+VOFW; | ||
2735 | } | ||
2736 | else if (srcFormat==PIX_FMT_BGR565) | ||
2737 | { | ||
2738 | RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW); | ||
2739 | src1= formatConvBuffer; | ||
2740 | src2= formatConvBuffer+VOFW; | ||
2741 | } | ||
2742 | else if (srcFormat==PIX_FMT_BGR555) | ||
2743 | { | ||
2744 | RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW); | ||
2745 | src1= formatConvBuffer; | ||
2746 | src2= formatConvBuffer+VOFW; | ||
2747 | } | ||
2748 | else if (srcFormat==PIX_FMT_BGR32) | ||
2749 | { | ||
2750 | RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW); | ||
2751 | src1= formatConvBuffer; | ||
2752 | src2= formatConvBuffer+VOFW; | ||
2753 | } | ||
2754 | else if (srcFormat==PIX_FMT_RGB24) | ||
2755 | { | ||
2756 | RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW); | ||
2757 | src1= formatConvBuffer; | ||
2758 | src2= formatConvBuffer+VOFW; | ||
2759 | } | ||
2760 | else if (srcFormat==PIX_FMT_RGB565) | ||
2761 | { | ||
2762 | RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW); | ||
2763 | src1= formatConvBuffer; | ||
2764 | src2= formatConvBuffer+VOFW; | ||
2765 | } | ||
2766 | else if (srcFormat==PIX_FMT_RGB555) | ||
2767 | { | ||
2768 | RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW); | ||
2769 | src1= formatConvBuffer; | ||
2770 | src2= formatConvBuffer+VOFW; | ||
2771 | } | ||
2772 | else if (isGray(srcFormat)) | ||
2773 | { | ||
2774 | return; | ||
2775 | } | ||
2776 | else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE || srcFormat==PIX_FMT_RGB4_BYTE) | ||
2777 | { | ||
2778 | RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, (uint32_t*)pal); | ||
2779 | src1= formatConvBuffer; | ||
2780 | src2= formatConvBuffer+VOFW; | ||
2781 | } | ||
2782 | |||
2783 | #ifdef HAVE_MMX | ||
2784 | // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one). | ||
2785 | if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) | ||
2786 | #else | ||
2787 | if (!(flags&SWS_FAST_BILINEAR)) | ||
2788 | #endif | ||
2789 | { | ||
2790 | RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); | ||
2791 | RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); | ||
2792 | } | ||
2793 | else // fast bilinear upscale / crap downscale | ||
2794 | { | ||
2795 | #if defined(ARCH_X86) | ||
2796 | #ifdef HAVE_MMX2 | ||
2797 | int i; | ||
2798 | #if defined(PIC) | ||
2799 | uint64_t ebxsave __attribute__((aligned(8))); | ||
2800 | #endif | ||
2801 | if (canMMX2BeUsed) | ||
2802 | { | ||
2803 | asm volatile( | ||
2804 | #if defined(PIC) | ||
2805 | "mov %%"REG_b", %6 \n\t" | ||
2806 | #endif | ||
2807 | "pxor %%mm7, %%mm7 \n\t" | ||
2808 | "mov %0, %%"REG_c" \n\t" | ||
2809 | "mov %1, %%"REG_D" \n\t" | ||
2810 | "mov %2, %%"REG_d" \n\t" | ||
2811 | "mov %3, %%"REG_b" \n\t" | ||
2812 | "xor %%"REG_a", %%"REG_a" \n\t" // i | ||
2813 | PREFETCH" (%%"REG_c") \n\t" | ||
2814 | PREFETCH" 32(%%"REG_c") \n\t" | ||
2815 | PREFETCH" 64(%%"REG_c") \n\t" | ||
2816 | |||
2817 | #ifdef ARCH_X86_64 | ||
2818 | |||
2819 | #define FUNNY_UV_CODE \ | ||
2820 | "movl (%%"REG_b"), %%esi \n\t"\ | ||
2821 | "call *%4 \n\t"\ | ||
2822 | "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\ | ||
2823 | "add %%"REG_S", %%"REG_c" \n\t"\ | ||
2824 | "add %%"REG_a", %%"REG_D" \n\t"\ | ||
2825 | "xor %%"REG_a", %%"REG_a" \n\t"\ | ||
2826 | |||
2827 | #else | ||
2828 | |||
2829 | #define FUNNY_UV_CODE \ | ||
2830 | "movl (%%"REG_b"), %%esi \n\t"\ | ||
2831 | "call *%4 \n\t"\ | ||
2832 | "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\ | ||
2833 | "add %%"REG_a", %%"REG_D" \n\t"\ | ||
2834 | "xor %%"REG_a", %%"REG_a" \n\t"\ | ||
2835 | |||
2836 | #endif /* ARCH_X86_64 */ | ||
2837 | |||
2838 | FUNNY_UV_CODE | ||
2839 | FUNNY_UV_CODE | ||
2840 | FUNNY_UV_CODE | ||
2841 | FUNNY_UV_CODE | ||
2842 | "xor %%"REG_a", %%"REG_a" \n\t" // i | ||
2843 | "mov %5, %%"REG_c" \n\t" // src | ||
2844 | "mov %1, %%"REG_D" \n\t" // buf1 | ||
2845 | "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t" | ||
2846 | PREFETCH" (%%"REG_c") \n\t" | ||
2847 | PREFETCH" 32(%%"REG_c") \n\t" | ||
2848 | PREFETCH" 64(%%"REG_c") \n\t" | ||
2849 | |||
2850 | FUNNY_UV_CODE | ||
2851 | FUNNY_UV_CODE | ||
2852 | FUNNY_UV_CODE | ||
2853 | FUNNY_UV_CODE | ||
2854 | |||
2855 | #if defined(PIC) | ||
2856 | "mov %6, %%"REG_b" \n\t" | ||
2857 | #endif | ||
2858 | :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), | ||
2859 | "m" (funnyUVCode), "m" (src2) | ||
2860 | #if defined(PIC) | ||
2861 | ,"m" (ebxsave) | ||
2862 | #endif | ||
2863 | : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D | ||
2864 | #if !defined(PIC) | ||
2865 | ,"%"REG_b | ||
2866 | #endif | ||
2867 | ); | ||
2868 | for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) | ||
2869 | { | ||
2870 | //printf("%d %d %d\n", dstWidth, i, srcW); | ||
2871 | dst[i] = src1[srcW-1]*128; | ||
2872 | dst[i+VOFW] = src2[srcW-1]*128; | ||
2873 | } | ||
2874 | } | ||
2875 | else | ||
2876 | { | ||
2877 | #endif /* HAVE_MMX2 */ | ||
2878 | long xInc_shr16 = (long) (xInc >> 16); | ||
2879 | uint16_t xInc_mask = xInc & 0xffff; | ||
2880 | asm volatile( | ||
2881 | "xor %%"REG_a", %%"REG_a" \n\t" // i | ||
2882 | "xor %%"REG_d", %%"REG_d" \n\t" // xx | ||
2883 | "xorl %%ecx, %%ecx \n\t" // 2*xalpha | ||
2884 | ASMALIGN(4) | ||
2885 | "1: \n\t" | ||
2886 | "mov %0, %%"REG_S" \n\t" | ||
2887 | "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx] | ||
2888 | "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1] | ||
2889 | "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | ||
2890 | "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | ||
2891 | "shll $16, %%edi \n\t" | ||
2892 | "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | ||
2893 | "mov %1, %%"REG_D" \n\t" | ||
2894 | "shrl $9, %%esi \n\t" | ||
2895 | "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t" | ||
2896 | |||
2897 | "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx] | ||
2898 | "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1] | ||
2899 | "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | ||
2900 | "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | ||
2901 | "shll $16, %%edi \n\t" | ||
2902 | "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | ||
2903 | "mov %1, %%"REG_D" \n\t" | ||
2904 | "shrl $9, %%esi \n\t" | ||
2905 | "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t" | ||
2906 | |||
2907 | "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF | ||
2908 | "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry | ||
2909 | "add $1, %%"REG_a" \n\t" | ||
2910 | "cmp %2, %%"REG_a" \n\t" | ||
2911 | " jb 1b \n\t" | ||
2912 | |||
2913 | /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here, | ||
2914 | which is needed to support GCC 4.0. */ | ||
2915 | #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) | ||
2916 | :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask), | ||
2917 | #else | ||
2918 | :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask), | ||
2919 | #endif | ||
2920 | "r" (src2) | ||
2921 | : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi" | ||
2922 | ); | ||
2923 | #ifdef HAVE_MMX2 | ||
2924 | } //if MMX2 can't be used | ||
2925 | #endif | ||
2926 | #else | ||
2927 | int i; | ||
2928 | unsigned int xpos=0; | ||
2929 | for (i=0;i<dstWidth;i++) | ||
2930 | { | ||
2931 | register unsigned int xx=xpos>>16; | ||
2932 | register unsigned int xalpha=(xpos&0xFFFF)>>9; | ||
2933 | dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); | ||
2934 | dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); | ||
2935 | /* slower | ||
2936 | dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha; | ||
2937 | dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha; | ||
2938 | */ | ||
2939 | xpos+=xInc; | ||
2940 | } | ||
2941 | #endif /* defined(ARCH_X86) */ | ||
2942 | } | ||
2943 | } | ||
2944 | |||
2945 | static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | ||
2946 | int srcSliceH, uint8_t* dst[], int dstStride[]){ | ||
2947 | |||
2948 | /* load a few things into local vars to make the code more readable? and faster */ | ||
2949 | const int srcW= c->srcW; | ||
2950 | const int dstW= c->dstW; | ||
2951 | const int dstH= c->dstH; | ||
2952 | const int chrDstW= c->chrDstW; | ||
2953 | const int chrSrcW= c->chrSrcW; | ||
2954 | const int lumXInc= c->lumXInc; | ||
2955 | const int chrXInc= c->chrXInc; | ||
2956 | const int dstFormat= c->dstFormat; | ||
2957 | const int srcFormat= c->srcFormat; | ||
2958 | const int flags= c->flags; | ||
2959 | const int canMMX2BeUsed= c->canMMX2BeUsed; | ||
2960 | int16_t *vLumFilterPos= c->vLumFilterPos; | ||
2961 | int16_t *vChrFilterPos= c->vChrFilterPos; | ||
2962 | int16_t *hLumFilterPos= c->hLumFilterPos; | ||
2963 | int16_t *hChrFilterPos= c->hChrFilterPos; | ||
2964 | int16_t *vLumFilter= c->vLumFilter; | ||
2965 | int16_t *vChrFilter= c->vChrFilter; | ||
2966 | int16_t *hLumFilter= c->hLumFilter; | ||
2967 | int16_t *hChrFilter= c->hChrFilter; | ||
2968 | int32_t *lumMmxFilter= c->lumMmxFilter; | ||
2969 | int32_t *chrMmxFilter= c->chrMmxFilter; | ||
2970 | const int vLumFilterSize= c->vLumFilterSize; | ||
2971 | const int vChrFilterSize= c->vChrFilterSize; | ||
2972 | const int hLumFilterSize= c->hLumFilterSize; | ||
2973 | const int hChrFilterSize= c->hChrFilterSize; | ||
2974 | int16_t **lumPixBuf= c->lumPixBuf; | ||
2975 | int16_t **chrPixBuf= c->chrPixBuf; | ||
2976 | const int vLumBufSize= c->vLumBufSize; | ||
2977 | const int vChrBufSize= c->vChrBufSize; | ||
2978 | uint8_t *funnyYCode= c->funnyYCode; | ||
2979 | uint8_t *funnyUVCode= c->funnyUVCode; | ||
2980 | uint8_t *formatConvBuffer= c->formatConvBuffer; | ||
2981 | const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample; | ||
2982 | const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); | ||
2983 | int lastDstY; | ||
2984 | uint8_t *pal=NULL; | ||
2985 | |||
2986 | /* vars which will change and which we need to store back in the context */ | ||
2987 | int dstY= c->dstY; | ||
2988 | int lumBufIndex= c->lumBufIndex; | ||
2989 | int chrBufIndex= c->chrBufIndex; | ||
2990 | int lastInLumBuf= c->lastInLumBuf; | ||
2991 | int lastInChrBuf= c->lastInChrBuf; | ||
2992 | |||
2993 | if (isPacked(c->srcFormat)){ | ||
2994 | pal= src[1]; | ||
2995 | src[0]= | ||
2996 | src[1]= | ||
2997 | src[2]= src[0]; | ||
2998 | srcStride[0]= | ||
2999 | srcStride[1]= | ||
3000 | srcStride[2]= srcStride[0]; | ||
3001 | } | ||
3002 | srcStride[1]<<= c->vChrDrop; | ||
3003 | srcStride[2]<<= c->vChrDrop; | ||
3004 | |||
3005 | //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2], | ||
3006 | // (int)dst[0], (int)dst[1], (int)dst[2]); | ||
3007 | |||
3008 | #if 0 //self test FIXME move to a vfilter or something | ||
3009 | { | ||
3010 | static volatile int i=0; | ||
3011 | i++; | ||
3012 | if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH) | ||
3013 | selfTest(src, srcStride, c->srcW, c->srcH); | ||
3014 | i--; | ||
3015 | } | ||
3016 | #endif | ||
3017 | |||
3018 | //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2], | ||
3019 | //dstStride[0],dstStride[1],dstStride[2]); | ||
3020 | |||
3021 | if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0) | ||
3022 | { | ||
3023 | static int firstTime=1; //FIXME move this into the context perhaps | ||
3024 | if (flags & SWS_PRINT_INFO && firstTime) | ||
3025 | { | ||
3026 | av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n" | ||
3027 | " ->cannot do aligned memory accesses anymore\n"); | ||
3028 | firstTime=0; | ||
3029 | } | ||
3030 | } | ||
3031 | |||
3032 | /* Note the user might start scaling the picture in the middle so this | ||
3033 | will not get executed. This is not really intended but works | ||
3034 | currently, so people might do it. */ | ||
3035 | if (srcSliceY ==0){ | ||
3036 | lumBufIndex=0; | ||
3037 | chrBufIndex=0; | ||
3038 | dstY=0; | ||
3039 | lastInLumBuf= -1; | ||
3040 | lastInChrBuf= -1; | ||
3041 | } | ||
3042 | |||
3043 | lastDstY= dstY; | ||
3044 | |||
3045 | for (;dstY < dstH; dstY++){ | ||
3046 | unsigned char *dest =dst[0]+dstStride[0]*dstY; | ||
3047 | const int chrDstY= dstY>>c->chrDstVSubSample; | ||
3048 | unsigned char *uDest=dst[1]+dstStride[1]*chrDstY; | ||
3049 | unsigned char *vDest=dst[2]+dstStride[2]*chrDstY; | ||
3050 | |||
3051 | const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input | ||
3052 | const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input | ||
3053 | const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input | ||
3054 | const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input | ||
3055 | |||
3056 | //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n", | ||
3057 | // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample); | ||
3058 | //handle holes (FAST_BILINEAR & weird filters) | ||
3059 | if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1; | ||
3060 | if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1; | ||
3061 | //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize); | ||
3062 | assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1); | ||
3063 | assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1); | ||
3064 | |||
3065 | // Do we have enough lines in this slice to output the dstY line | ||
3066 | if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample)) | ||
3067 | { | ||
3068 | //Do horizontal scaling | ||
3069 | while(lastInLumBuf < lastLumSrcY) | ||
3070 | { | ||
3071 | uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; | ||
3072 | lumBufIndex++; | ||
3073 | //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY); | ||
3074 | assert(lumBufIndex < 2*vLumBufSize); | ||
3075 | assert(lastInLumBuf + 1 - srcSliceY < srcSliceH); | ||
3076 | assert(lastInLumBuf + 1 - srcSliceY >= 0); | ||
3077 | //printf("%d %d\n", lumBufIndex, vLumBufSize); | ||
3078 | RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, | ||
3079 | flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, | ||
3080 | funnyYCode, c->srcFormat, formatConvBuffer, | ||
3081 | c->lumMmx2Filter, c->lumMmx2FilterPos, pal); | ||
3082 | lastInLumBuf++; | ||
3083 | } | ||
3084 | while(lastInChrBuf < lastChrSrcY) | ||
3085 | { | ||
3086 | uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; | ||
3087 | uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; | ||
3088 | chrBufIndex++; | ||
3089 | assert(chrBufIndex < 2*vChrBufSize); | ||
3090 | assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH)); | ||
3091 | assert(lastInChrBuf + 1 - chrSrcSliceY >= 0); | ||
3092 | //FIXME replace parameters through context struct (some at least) | ||
3093 | |||
3094 | if (!(isGray(srcFormat) || isGray(dstFormat))) | ||
3095 | RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, | ||
3096 | flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, | ||
3097 | funnyUVCode, c->srcFormat, formatConvBuffer, | ||
3098 | c->chrMmx2Filter, c->chrMmx2FilterPos, pal); | ||
3099 | lastInChrBuf++; | ||
3100 | } | ||
3101 | //wrap buf index around to stay inside the ring buffer | ||
3102 | if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize; | ||
3103 | if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize; | ||
3104 | } | ||
3105 | else // not enough lines left in this slice -> load the rest in the buffer | ||
3106 | { | ||
3107 | /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n", | ||
3108 | firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY, | ||
3109 | lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize, | ||
3110 | vChrBufSize, vLumBufSize);*/ | ||
3111 | |||
3112 | //Do horizontal scaling | ||
3113 | while(lastInLumBuf+1 < srcSliceY + srcSliceH) | ||
3114 | { | ||
3115 | uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; | ||
3116 | lumBufIndex++; | ||
3117 | assert(lumBufIndex < 2*vLumBufSize); | ||
3118 | assert(lastInLumBuf + 1 - srcSliceY < srcSliceH); | ||
3119 | assert(lastInLumBuf + 1 - srcSliceY >= 0); | ||
3120 | RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc, | ||
3121 | flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize, | ||
3122 | funnyYCode, c->srcFormat, formatConvBuffer, | ||
3123 | c->lumMmx2Filter, c->lumMmx2FilterPos, pal); | ||
3124 | lastInLumBuf++; | ||
3125 | } | ||
3126 | while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH)) | ||
3127 | { | ||
3128 | uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; | ||
3129 | uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; | ||
3130 | chrBufIndex++; | ||
3131 | assert(chrBufIndex < 2*vChrBufSize); | ||
3132 | assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH); | ||
3133 | assert(lastInChrBuf + 1 - chrSrcSliceY >= 0); | ||
3134 | |||
3135 | if (!(isGray(srcFormat) || isGray(dstFormat))) | ||
3136 | RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, | ||
3137 | flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize, | ||
3138 | funnyUVCode, c->srcFormat, formatConvBuffer, | ||
3139 | c->chrMmx2Filter, c->chrMmx2FilterPos, pal); | ||
3140 | lastInChrBuf++; | ||
3141 | } | ||
3142 | //wrap buf index around to stay inside the ring buffer | ||
3143 | if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize; | ||
3144 | if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize; | ||
3145 | break; //we can't output a dstY line so let's try with the next slice | ||
3146 | } | ||
3147 | |||
3148 | #ifdef HAVE_MMX | ||
3149 | b5Dither= ff_dither8[dstY&1]; | ||
3150 | g6Dither= ff_dither4[dstY&1]; | ||
3151 | g5Dither= ff_dither8[dstY&1]; | ||
3152 | r5Dither= ff_dither8[(dstY+1)&1]; | ||
3153 | #endif | ||
3154 | if (dstY < dstH-2) | ||
3155 | { | ||
3156 | int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; | ||
3157 | int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | ||
3158 | #ifdef HAVE_MMX | ||
3159 | int i; | ||
3160 | if (flags & SWS_ACCURATE_RND){ | ||
3161 | for (i=0; i<vLumFilterSize; i+=2){ | ||
3162 | lumMmxFilter[2*i+0]= (int32_t)lumSrcPtr[i ]; | ||
3163 | lumMmxFilter[2*i+1]= (int32_t)lumSrcPtr[i+(vLumFilterSize>1)]; | ||
3164 | lumMmxFilter[2*i+2]= | ||
3165 | lumMmxFilter[2*i+3]= vLumFilter[dstY*vLumFilterSize + i ] | ||
3166 | + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0); | ||
3167 | } | ||
3168 | for (i=0; i<vChrFilterSize; i+=2){ | ||
3169 | chrMmxFilter[2*i+0]= (int32_t)chrSrcPtr[i ]; | ||
3170 | chrMmxFilter[2*i+1]= (int32_t)chrSrcPtr[i+(vChrFilterSize>1)]; | ||
3171 | chrMmxFilter[2*i+2]= | ||
3172 | chrMmxFilter[2*i+3]= vChrFilter[chrDstY*vChrFilterSize + i ] | ||
3173 | + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0); | ||
3174 | } | ||
3175 | }else{ | ||
3176 | for (i=0; i<vLumFilterSize; i++) | ||
3177 | { | ||
3178 | lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i]; | ||
3179 | lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32; | ||
3180 | lumMmxFilter[4*i+2]= | ||
3181 | lumMmxFilter[4*i+3]= | ||
3182 | ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001; | ||
3183 | } | ||
3184 | for (i=0; i<vChrFilterSize; i++) | ||
3185 | { | ||
3186 | chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i]; | ||
3187 | chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32; | ||
3188 | chrMmxFilter[4*i+2]= | ||
3189 | chrMmxFilter[4*i+3]= | ||
3190 | ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001; | ||
3191 | } | ||
3192 | } | ||
3193 | #endif | ||
3194 | if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){ | ||
3195 | const int chrSkipMask= (1<<c->chrDstVSubSample)-1; | ||
3196 | if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi | ||
3197 | RENAME(yuv2nv12X)(c, | ||
3198 | vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | ||
3199 | vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | ||
3200 | dest, uDest, dstW, chrDstW, dstFormat); | ||
3201 | } | ||
3202 | else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like | ||
3203 | { | ||
3204 | const int chrSkipMask= (1<<c->chrDstVSubSample)-1; | ||
3205 | if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi | ||
3206 | if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12 | ||
3207 | { | ||
3208 | int16_t *lumBuf = lumPixBuf[0]; | ||
3209 | int16_t *chrBuf= chrPixBuf[0]; | ||
3210 | RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW); | ||
3211 | } | ||
3212 | else //General YV12 | ||
3213 | { | ||
3214 | RENAME(yuv2yuvX)(c, | ||
3215 | vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | ||
3216 | vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | ||
3217 | dest, uDest, vDest, dstW, chrDstW); | ||
3218 | } | ||
3219 | } | ||
3220 | else | ||
3221 | { | ||
3222 | assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); | ||
3223 | assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); | ||
3224 | if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB | ||
3225 | { | ||
3226 | int chrAlpha= vChrFilter[2*dstY+1]; | ||
3227 | RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1), | ||
3228 | dest, dstW, chrAlpha, dstFormat, flags, dstY); | ||
3229 | } | ||
3230 | else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB | ||
3231 | { | ||
3232 | int lumAlpha= vLumFilter[2*dstY+1]; | ||
3233 | int chrAlpha= vChrFilter[2*dstY+1]; | ||
3234 | lumMmxFilter[2]= | ||
3235 | lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001; | ||
3236 | chrMmxFilter[2]= | ||
3237 | chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001; | ||
3238 | RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1), | ||
3239 | dest, dstW, lumAlpha, chrAlpha, dstY); | ||
3240 | } | ||
3241 | else //general RGB | ||
3242 | { | ||
3243 | RENAME(yuv2packedX)(c, | ||
3244 | vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, | ||
3245 | vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | ||
3246 | dest, dstW, dstY); | ||
3247 | } | ||
3248 | } | ||
3249 | } | ||
3250 | else // hmm looks like we can't use MMX here without overwriting this array's tail | ||
3251 | { | ||
3252 | int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; | ||
3253 | int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; | ||
3254 | if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){ | ||
3255 | const int chrSkipMask= (1<<c->chrDstVSubSample)-1; | ||
3256 | if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi | ||
3257 | yuv2nv12XinC( | ||
3258 | vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | ||
3259 | vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | ||
3260 | dest, uDest, dstW, chrDstW, dstFormat); | ||
3261 | } | ||
3262 | else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 | ||
3263 | { | ||
3264 | const int chrSkipMask= (1<<c->chrDstVSubSample)-1; | ||
3265 | if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi | ||
3266 | yuv2yuvXinC( | ||
3267 | vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, | ||
3268 | vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | ||
3269 | dest, uDest, vDest, dstW, chrDstW); | ||
3270 | } | ||
3271 | else | ||
3272 | { | ||
3273 | assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); | ||
3274 | assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); | ||
3275 | yuv2packedXinC(c, | ||
3276 | vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, | ||
3277 | vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, | ||
3278 | dest, dstW, dstY); | ||
3279 | } | ||
3280 | } | ||
3281 | } | ||
3282 | |||
3283 | #ifdef HAVE_MMX | ||
3284 | asm volatile(SFENCE:::"memory"); | ||
3285 | asm volatile(EMMS:::"memory"); | ||
3286 | #endif | ||
3287 | /* store changed local vars back in the context */ | ||
3288 | c->dstY= dstY; | ||
3289 | c->lumBufIndex= lumBufIndex; | ||
3290 | c->chrBufIndex= chrBufIndex; | ||
3291 | c->lastInLumBuf= lastInLumBuf; | ||
3292 | c->lastInChrBuf= lastInChrBuf; | ||
3293 | |||
3294 | return dstY - lastDstY; | ||
3295 | } | ||
diff --git a/src/plugins/ffmpeg/libswscale/yuv2rgb.c b/src/plugins/ffmpeg/libswscale/yuv2rgb.c deleted file mode 100644 index f0613a8..0000000 --- a/src/plugins/ffmpeg/libswscale/yuv2rgb.c +++ /dev/null | |||
@@ -1,887 +0,0 @@ | |||
1 | /* | ||
2 | * yuv2rgb.c, Software YUV to RGB converter | ||
3 | * | ||
4 | * Copyright (C) 1999, Aaron Holtzman <aholtzma@ess.engr.uvic.ca> | ||
5 | * | ||
6 | * Functions broken out from display_x11.c and several new modes | ||
7 | * added by HÃ¥kan Hjort <d95hjort@dtek.chalmers.se> | ||
8 | * | ||
9 | * 15 & 16 bpp support by Franck Sicard <Franck.Sicard@solsoft.fr> | ||
10 | * | ||
11 | * MMX/MMX2 template stuff (needed for fast movntq support), | ||
12 | * 1,4,8bpp support and context / deglobalize stuff | ||
13 | * by Michael Niedermayer (michaelni@gmx.at) | ||
14 | * | ||
15 | * This file is part of mpeg2dec, a free MPEG-2 video decoder | ||
16 | * | ||
17 | * mpeg2dec is free software; you can redistribute it and/or modify | ||
18 | * it under the terms of the GNU General Public License as published by | ||
19 | * the Free Software Foundation; either version 2, or (at your option) | ||
20 | * any later version. | ||
21 | * | ||
22 | * mpeg2dec is distributed in the hope that it will be useful, | ||
23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
25 | * GNU General Public License for more details. | ||
26 | * | ||
27 | * You should have received a copy of the GNU General Public License | ||
28 | * along with mpeg2dec; if not, write to the Free Software | ||
29 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
30 | */ | ||
31 | |||
32 | #include <stdio.h> | ||
33 | #include <stdlib.h> | ||
34 | #include <inttypes.h> | ||
35 | #include <assert.h> | ||
36 | |||
37 | #include "config.h" | ||
38 | #include "rgb2rgb.h" | ||
39 | #include "swscale.h" | ||
40 | #include "swscale_internal.h" | ||
41 | |||
42 | #define DITHER1XBPP // only for MMX | ||
43 | |||
44 | const uint8_t __attribute__((aligned(8))) dither_2x2_4[2][8]={ | ||
45 | { 1, 3, 1, 3, 1, 3, 1, 3, }, | ||
46 | { 2, 0, 2, 0, 2, 0, 2, 0, }, | ||
47 | }; | ||
48 | |||
49 | const uint8_t __attribute__((aligned(8))) dither_2x2_8[2][8]={ | ||
50 | { 6, 2, 6, 2, 6, 2, 6, 2, }, | ||
51 | { 0, 4, 0, 4, 0, 4, 0, 4, }, | ||
52 | }; | ||
53 | |||
54 | const uint8_t __attribute__((aligned(8))) dither_8x8_32[8][8]={ | ||
55 | { 17, 9, 23, 15, 16, 8, 22, 14, }, | ||
56 | { 5, 29, 3, 27, 4, 28, 2, 26, }, | ||
57 | { 21, 13, 19, 11, 20, 12, 18, 10, }, | ||
58 | { 0, 24, 6, 30, 1, 25, 7, 31, }, | ||
59 | { 16, 8, 22, 14, 17, 9, 23, 15, }, | ||
60 | { 4, 28, 2, 26, 5, 29, 3, 27, }, | ||
61 | { 20, 12, 18, 10, 21, 13, 19, 11, }, | ||
62 | { 1, 25, 7, 31, 0, 24, 6, 30, }, | ||
63 | }; | ||
64 | |||
65 | #if 0 | ||
66 | const uint8_t __attribute__((aligned(8))) dither_8x8_64[8][8]={ | ||
67 | { 0, 48, 12, 60, 3, 51, 15, 63, }, | ||
68 | { 32, 16, 44, 28, 35, 19, 47, 31, }, | ||
69 | { 8, 56, 4, 52, 11, 59, 7, 55, }, | ||
70 | { 40, 24, 36, 20, 43, 27, 39, 23, }, | ||
71 | { 2, 50, 14, 62, 1, 49, 13, 61, }, | ||
72 | { 34, 18, 46, 30, 33, 17, 45, 29, }, | ||
73 | { 10, 58, 6, 54, 9, 57, 5, 53, }, | ||
74 | { 42, 26, 38, 22, 41, 25, 37, 21, }, | ||
75 | }; | ||
76 | #endif | ||
77 | |||
78 | const uint8_t __attribute__((aligned(8))) dither_8x8_73[8][8]={ | ||
79 | { 0, 55, 14, 68, 3, 58, 17, 72, }, | ||
80 | { 37, 18, 50, 32, 40, 22, 54, 35, }, | ||
81 | { 9, 64, 5, 59, 13, 67, 8, 63, }, | ||
82 | { 46, 27, 41, 23, 49, 31, 44, 26, }, | ||
83 | { 2, 57, 16, 71, 1, 56, 15, 70, }, | ||
84 | { 39, 21, 52, 34, 38, 19, 51, 33, }, | ||
85 | { 11, 66, 7, 62, 10, 65, 6, 60, }, | ||
86 | { 48, 30, 43, 25, 47, 29, 42, 24, }, | ||
87 | }; | ||
88 | |||
89 | #if 0 | ||
90 | const uint8_t __attribute__((aligned(8))) dither_8x8_128[8][8]={ | ||
91 | { 68, 36, 92, 60, 66, 34, 90, 58, }, | ||
92 | { 20, 116, 12, 108, 18, 114, 10, 106, }, | ||
93 | { 84, 52, 76, 44, 82, 50, 74, 42, }, | ||
94 | { 0, 96, 24, 120, 6, 102, 30, 126, }, | ||
95 | { 64, 32, 88, 56, 70, 38, 94, 62, }, | ||
96 | { 16, 112, 8, 104, 22, 118, 14, 110, }, | ||
97 | { 80, 48, 72, 40, 86, 54, 78, 46, }, | ||
98 | { 4, 100, 28, 124, 2, 98, 26, 122, }, | ||
99 | }; | ||
100 | #endif | ||
101 | |||
102 | #if 1 | ||
103 | const uint8_t __attribute__((aligned(8))) dither_8x8_220[8][8]={ | ||
104 | {117, 62, 158, 103, 113, 58, 155, 100, }, | ||
105 | { 34, 199, 21, 186, 31, 196, 17, 182, }, | ||
106 | {144, 89, 131, 76, 141, 86, 127, 72, }, | ||
107 | { 0, 165, 41, 206, 10, 175, 52, 217, }, | ||
108 | {110, 55, 151, 96, 120, 65, 162, 107, }, | ||
109 | { 28, 193, 14, 179, 38, 203, 24, 189, }, | ||
110 | {138, 83, 124, 69, 148, 93, 134, 79, }, | ||
111 | { 7, 172, 48, 213, 3, 168, 45, 210, }, | ||
112 | }; | ||
113 | #elif 1 | ||
114 | // tries to correct a gamma of 1.5 | ||
115 | const uint8_t __attribute__((aligned(8))) dither_8x8_220[8][8]={ | ||
116 | { 0, 143, 18, 200, 2, 156, 25, 215, }, | ||
117 | { 78, 28, 125, 64, 89, 36, 138, 74, }, | ||
118 | { 10, 180, 3, 161, 16, 195, 8, 175, }, | ||
119 | {109, 51, 93, 38, 121, 60, 105, 47, }, | ||
120 | { 1, 152, 23, 210, 0, 147, 20, 205, }, | ||
121 | { 85, 33, 134, 71, 81, 30, 130, 67, }, | ||
122 | { 14, 190, 6, 171, 12, 185, 5, 166, }, | ||
123 | {117, 57, 101, 44, 113, 54, 97, 41, }, | ||
124 | }; | ||
125 | #elif 1 | ||
126 | // tries to correct a gamma of 2.0 | ||
127 | const uint8_t __attribute__((aligned(8))) dither_8x8_220[8][8]={ | ||
128 | { 0, 124, 8, 193, 0, 140, 12, 213, }, | ||
129 | { 55, 14, 104, 42, 66, 19, 119, 52, }, | ||
130 | { 3, 168, 1, 145, 6, 187, 3, 162, }, | ||
131 | { 86, 31, 70, 21, 99, 39, 82, 28, }, | ||
132 | { 0, 134, 11, 206, 0, 129, 9, 200, }, | ||
133 | { 62, 17, 114, 48, 58, 16, 109, 45, }, | ||
134 | { 5, 181, 2, 157, 4, 175, 1, 151, }, | ||
135 | { 95, 36, 78, 26, 90, 34, 74, 24, }, | ||
136 | }; | ||
137 | #else | ||
138 | // tries to correct a gamma of 2.5 | ||
139 | const uint8_t __attribute__((aligned(8))) dither_8x8_220[8][8]={ | ||
140 | { 0, 107, 3, 187, 0, 125, 6, 212, }, | ||
141 | { 39, 7, 86, 28, 49, 11, 102, 36, }, | ||
142 | { 1, 158, 0, 131, 3, 180, 1, 151, }, | ||
143 | { 68, 19, 52, 12, 81, 25, 64, 17, }, | ||
144 | { 0, 119, 5, 203, 0, 113, 4, 195, }, | ||
145 | { 45, 9, 96, 33, 42, 8, 91, 30, }, | ||
146 | { 2, 172, 1, 144, 2, 165, 0, 137, }, | ||
147 | { 77, 23, 60, 15, 72, 21, 56, 14, }, | ||
148 | }; | ||
149 | #endif | ||
150 | |||
151 | #ifdef HAVE_MMX | ||
152 | |||
153 | /* hope these constant values are cache line aligned */ | ||
154 | DECLARE_ASM_CONST(8, uint64_t, mmx_00ffw) = 0x00ff00ff00ff00ffULL; | ||
155 | DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL; | ||
156 | DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL; | ||
157 | |||
158 | // The volatile is required because gcc otherwise optimizes some writes away | ||
159 | // not knowing that these are read in the ASM block. | ||
160 | static volatile uint64_t attribute_used __attribute__((aligned(8))) b5Dither; | ||
161 | static volatile uint64_t attribute_used __attribute__((aligned(8))) g5Dither; | ||
162 | static volatile uint64_t attribute_used __attribute__((aligned(8))) g6Dither; | ||
163 | static volatile uint64_t attribute_used __attribute__((aligned(8))) r5Dither; | ||
164 | |||
165 | #undef HAVE_MMX | ||
166 | |||
167 | //MMX versions | ||
168 | #undef RENAME | ||
169 | #define HAVE_MMX | ||
170 | #undef HAVE_MMX2 | ||
171 | #undef HAVE_3DNOW | ||
172 | #define RENAME(a) a ## _MMX | ||
173 | #include "yuv2rgb_template.c" | ||
174 | |||
175 | //MMX2 versions | ||
176 | #undef RENAME | ||
177 | #define HAVE_MMX | ||
178 | #define HAVE_MMX2 | ||
179 | #undef HAVE_3DNOW | ||
180 | #define RENAME(a) a ## _MMX2 | ||
181 | #include "yuv2rgb_template.c" | ||
182 | |||
183 | #endif /* HAVE_MMX */ | ||
184 | |||
185 | const int32_t Inverse_Table_6_9[8][4] = { | ||
186 | {117504, 138453, 13954, 34903}, /* no sequence_display_extension */ | ||
187 | {117504, 138453, 13954, 34903}, /* ITU-R Rec. 709 (1990) */ | ||
188 | {104597, 132201, 25675, 53279}, /* unspecified */ | ||
189 | {104597, 132201, 25675, 53279}, /* reserved */ | ||
190 | {104448, 132798, 24759, 53109}, /* FCC */ | ||
191 | {104597, 132201, 25675, 53279}, /* ITU-R Rec. 624-4 System B, G */ | ||
192 | {104597, 132201, 25675, 53279}, /* SMPTE 170M */ | ||
193 | {117579, 136230, 16907, 35559} /* SMPTE 240M (1987) */ | ||
194 | }; | ||
195 | |||
196 | #define RGB(i) \ | ||
197 | U = pu[i]; \ | ||
198 | V = pv[i]; \ | ||
199 | r = (void *)c->table_rV[V]; \ | ||
200 | g = (void *)(c->table_gU[U] + c->table_gV[V]); \ | ||
201 | b = (void *)c->table_bU[U]; | ||
202 | |||
203 | #define DST1(i) \ | ||
204 | Y = py_1[2*i]; \ | ||
205 | dst_1[2*i] = r[Y] + g[Y] + b[Y]; \ | ||
206 | Y = py_1[2*i+1]; \ | ||
207 | dst_1[2*i+1] = r[Y] + g[Y] + b[Y]; | ||
208 | |||
209 | #define DST2(i) \ | ||
210 | Y = py_2[2*i]; \ | ||
211 | dst_2[2*i] = r[Y] + g[Y] + b[Y]; \ | ||
212 | Y = py_2[2*i+1]; \ | ||
213 | dst_2[2*i+1] = r[Y] + g[Y] + b[Y]; | ||
214 | |||
215 | #define DST1RGB(i) \ | ||
216 | Y = py_1[2*i]; \ | ||
217 | dst_1[6*i] = r[Y]; dst_1[6*i+1] = g[Y]; dst_1[6*i+2] = b[Y]; \ | ||
218 | Y = py_1[2*i+1]; \ | ||
219 | dst_1[6*i+3] = r[Y]; dst_1[6*i+4] = g[Y]; dst_1[6*i+5] = b[Y]; | ||
220 | |||
221 | #define DST2RGB(i) \ | ||
222 | Y = py_2[2*i]; \ | ||
223 | dst_2[6*i] = r[Y]; dst_2[6*i+1] = g[Y]; dst_2[6*i+2] = b[Y]; \ | ||
224 | Y = py_2[2*i+1]; \ | ||
225 | dst_2[6*i+3] = r[Y]; dst_2[6*i+4] = g[Y]; dst_2[6*i+5] = b[Y]; | ||
226 | |||
227 | #define DST1BGR(i) \ | ||
228 | Y = py_1[2*i]; \ | ||
229 | dst_1[6*i] = b[Y]; dst_1[6*i+1] = g[Y]; dst_1[6*i+2] = r[Y]; \ | ||
230 | Y = py_1[2*i+1]; \ | ||
231 | dst_1[6*i+3] = b[Y]; dst_1[6*i+4] = g[Y]; dst_1[6*i+5] = r[Y]; | ||
232 | |||
233 | #define DST2BGR(i) \ | ||
234 | Y = py_2[2*i]; \ | ||
235 | dst_2[6*i] = b[Y]; dst_2[6*i+1] = g[Y]; dst_2[6*i+2] = r[Y]; \ | ||
236 | Y = py_2[2*i+1]; \ | ||
237 | dst_2[6*i+3] = b[Y]; dst_2[6*i+4] = g[Y]; dst_2[6*i+5] = r[Y]; | ||
238 | |||
239 | #define PROLOG(func_name, dst_type) \ | ||
240 | static int func_name(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, \ | ||
241 | int srcSliceH, uint8_t* dst[], int dstStride[]){\ | ||
242 | int y;\ | ||
243 | \ | ||
244 | if (c->srcFormat == PIX_FMT_YUV422P){\ | ||
245 | srcStride[1] *= 2;\ | ||
246 | srcStride[2] *= 2;\ | ||
247 | }\ | ||
248 | for (y=0; y<srcSliceH; y+=2){\ | ||
249 | dst_type *dst_1= (dst_type*)(dst[0] + (y+srcSliceY )*dstStride[0]);\ | ||
250 | dst_type *dst_2= (dst_type*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);\ | ||
251 | dst_type av_unused *r, *b;\ | ||
252 | dst_type *g;\ | ||
253 | uint8_t *py_1= src[0] + y*srcStride[0];\ | ||
254 | uint8_t *py_2= py_1 + srcStride[0];\ | ||
255 | uint8_t *pu= src[1] + (y>>1)*srcStride[1];\ | ||
256 | uint8_t *pv= src[2] + (y>>1)*srcStride[2];\ | ||
257 | unsigned int h_size= c->dstW>>3;\ | ||
258 | while (h_size--) {\ | ||
259 | int av_unused U, V;\ | ||
260 | int Y;\ | ||
261 | |||
262 | #define EPILOG1(dst_delta)\ | ||
263 | pu += 4;\ | ||
264 | pv += 4;\ | ||
265 | py_1 += 8;\ | ||
266 | py_2 += 8;\ | ||
267 | dst_1 += dst_delta;\ | ||
268 | dst_2 += dst_delta;\ | ||
269 | }\ | ||
270 | if (c->dstW & 4) {\ | ||
271 | int av_unused Y, U, V;\ | ||
272 | |||
273 | #define EPILOG2()\ | ||
274 | }\ | ||
275 | }\ | ||
276 | return srcSliceH;\ | ||
277 | } | ||
278 | |||
279 | #define EPILOG(dst_delta)\ | ||
280 | EPILOG1(dst_delta)\ | ||
281 | EPILOG2() | ||
282 | |||
283 | PROLOG(yuv2rgb_c_32, uint32_t) | ||
284 | RGB(0); | ||
285 | DST1(0); | ||
286 | DST2(0); | ||
287 | |||
288 | RGB(1); | ||
289 | DST2(1); | ||
290 | DST1(1); | ||
291 | |||
292 | RGB(2); | ||
293 | DST1(2); | ||
294 | DST2(2); | ||
295 | |||
296 | RGB(3); | ||
297 | DST2(3); | ||
298 | DST1(3); | ||
299 | EPILOG1(8) | ||
300 | RGB(0); | ||
301 | DST1(0); | ||
302 | DST2(0); | ||
303 | |||
304 | RGB(1); | ||
305 | DST2(1); | ||
306 | DST1(1); | ||
307 | EPILOG2() | ||
308 | |||
309 | PROLOG(yuv2rgb_c_24_rgb, uint8_t) | ||
310 | RGB(0); | ||
311 | DST1RGB(0); | ||
312 | DST2RGB(0); | ||
313 | |||
314 | RGB(1); | ||
315 | DST2RGB(1); | ||
316 | DST1RGB(1); | ||
317 | |||
318 | RGB(2); | ||
319 | DST1RGB(2); | ||
320 | DST2RGB(2); | ||
321 | |||
322 | RGB(3); | ||
323 | DST2RGB(3); | ||
324 | DST1RGB(3); | ||
325 | EPILOG1(24) | ||
326 | RGB(0); | ||
327 | DST1RGB(0); | ||
328 | DST2RGB(0); | ||
329 | |||
330 | RGB(1); | ||
331 | DST2RGB(1); | ||
332 | DST1RGB(1); | ||
333 | EPILOG2() | ||
334 | |||
335 | // only trivial mods from yuv2rgb_c_24_rgb | ||
336 | PROLOG(yuv2rgb_c_24_bgr, uint8_t) | ||
337 | RGB(0); | ||
338 | DST1BGR(0); | ||
339 | DST2BGR(0); | ||
340 | |||
341 | RGB(1); | ||
342 | DST2BGR(1); | ||
343 | DST1BGR(1); | ||
344 | |||
345 | RGB(2); | ||
346 | DST1BGR(2); | ||
347 | DST2BGR(2); | ||
348 | |||
349 | RGB(3); | ||
350 | DST2BGR(3); | ||
351 | DST1BGR(3); | ||
352 | EPILOG1(24) | ||
353 | RGB(0); | ||
354 | DST1BGR(0); | ||
355 | DST2BGR(0); | ||
356 | |||
357 | RGB(1); | ||
358 | DST2BGR(1); | ||
359 | DST1BGR(1); | ||
360 | EPILOG2() | ||
361 | |||
362 | // This is exactly the same code as yuv2rgb_c_32 except for the types of | ||
363 | // r, g, b, dst_1, dst_2 | ||
364 | PROLOG(yuv2rgb_c_16, uint16_t) | ||
365 | RGB(0); | ||
366 | DST1(0); | ||
367 | DST2(0); | ||
368 | |||
369 | RGB(1); | ||
370 | DST2(1); | ||
371 | DST1(1); | ||
372 | |||
373 | RGB(2); | ||
374 | DST1(2); | ||
375 | DST2(2); | ||
376 | |||
377 | RGB(3); | ||
378 | DST2(3); | ||
379 | DST1(3); | ||
380 | EPILOG(8) | ||
381 | |||
382 | #if HAVE_DEAD_CODE | ||
383 | // This is exactly the same code as yuv2rgb_c_32 except for the types of | ||
384 | // r, g, b, dst_1, dst_2 | ||
385 | PROLOG(yuv2rgb_c_8, uint8_t) | ||
386 | RGB(0); | ||
387 | DST1(0); | ||
388 | DST2(0); | ||
389 | |||
390 | RGB(1); | ||
391 | DST2(1); | ||
392 | DST1(1); | ||
393 | |||
394 | RGB(2); | ||
395 | DST1(2); | ||
396 | DST2(2); | ||
397 | |||
398 | RGB(3); | ||
399 | DST2(3); | ||
400 | DST1(3); | ||
401 | EPILOG(8) | ||
402 | #endif | ||
403 | |||
404 | // r, g, b, dst_1, dst_2 | ||
405 | PROLOG(yuv2rgb_c_8_ordered_dither, uint8_t) | ||
406 | const uint8_t *d32= dither_8x8_32[y&7]; | ||
407 | const uint8_t *d64= dither_8x8_73[y&7]; | ||
408 | #define DST1bpp8(i,o) \ | ||
409 | Y = py_1[2*i]; \ | ||
410 | dst_1[2*i] = r[Y+d32[0+o]] + g[Y+d32[0+o]] + b[Y+d64[0+o]]; \ | ||
411 | Y = py_1[2*i+1]; \ | ||
412 | dst_1[2*i+1] = r[Y+d32[1+o]] + g[Y+d32[1+o]] + b[Y+d64[1+o]]; | ||
413 | |||
414 | #define DST2bpp8(i,o) \ | ||
415 | Y = py_2[2*i]; \ | ||
416 | dst_2[2*i] = r[Y+d32[8+o]] + g[Y+d32[8+o]] + b[Y+d64[8+o]]; \ | ||
417 | Y = py_2[2*i+1]; \ | ||
418 | dst_2[2*i+1] = r[Y+d32[9+o]] + g[Y+d32[9+o]] + b[Y+d64[9+o]]; | ||
419 | |||
420 | |||
421 | RGB(0); | ||
422 | DST1bpp8(0,0); | ||
423 | DST2bpp8(0,0); | ||
424 | |||
425 | RGB(1); | ||
426 | DST2bpp8(1,2); | ||
427 | DST1bpp8(1,2); | ||
428 | |||
429 | RGB(2); | ||
430 | DST1bpp8(2,4); | ||
431 | DST2bpp8(2,4); | ||
432 | |||
433 | RGB(3); | ||
434 | DST2bpp8(3,6); | ||
435 | DST1bpp8(3,6); | ||
436 | EPILOG(8) | ||
437 | |||
438 | |||
439 | // This is exactly the same code as yuv2rgb_c_32 except for the types of | ||
440 | // r, g, b, dst_1, dst_2 | ||
441 | #if HAVE_DEAD_CODE | ||
442 | PROLOG(yuv2rgb_c_4, uint8_t) | ||
443 | int acc; | ||
444 | #define DST1_4(i) \ | ||
445 | Y = py_1[2*i]; \ | ||
446 | acc = r[Y] + g[Y] + b[Y]; \ | ||
447 | Y = py_1[2*i+1]; \ | ||
448 | acc |= (r[Y] + g[Y] + b[Y])<<4; \ | ||
449 | dst_1[i] = acc; | ||
450 | |||
451 | #define DST2_4(i) \ | ||
452 | Y = py_2[2*i]; \ | ||
453 | acc = r[Y] + g[Y] + b[Y]; \ | ||
454 | Y = py_2[2*i+1]; \ | ||
455 | acc |= (r[Y] + g[Y] + b[Y])<<4; \ | ||
456 | dst_2[i] = acc; | ||
457 | |||
458 | RGB(0); | ||
459 | DST1_4(0); | ||
460 | DST2_4(0); | ||
461 | |||
462 | RGB(1); | ||
463 | DST2_4(1); | ||
464 | DST1_4(1); | ||
465 | |||
466 | RGB(2); | ||
467 | DST1_4(2); | ||
468 | DST2_4(2); | ||
469 | |||
470 | RGB(3); | ||
471 | DST2_4(3); | ||
472 | DST1_4(3); | ||
473 | EPILOG(4) | ||
474 | #endif | ||
475 | |||
476 | PROLOG(yuv2rgb_c_4_ordered_dither, uint8_t) | ||
477 | const uint8_t *d64= dither_8x8_73[y&7]; | ||
478 | const uint8_t *d128=dither_8x8_220[y&7]; | ||
479 | int acc; | ||
480 | |||
481 | #define DST1bpp4(i,o) \ | ||
482 | Y = py_1[2*i]; \ | ||
483 | acc = r[Y+d128[0+o]] + g[Y+d64[0+o]] + b[Y+d128[0+o]]; \ | ||
484 | Y = py_1[2*i+1]; \ | ||
485 | acc |= (r[Y+d128[1+o]] + g[Y+d64[1+o]] + b[Y+d128[1+o]])<<4; \ | ||
486 | dst_1[i]= acc; | ||
487 | |||
488 | #define DST2bpp4(i,o) \ | ||
489 | Y = py_2[2*i]; \ | ||
490 | acc = r[Y+d128[8+o]] + g[Y+d64[8+o]] + b[Y+d128[8+o]]; \ | ||
491 | Y = py_2[2*i+1]; \ | ||
492 | acc |= (r[Y+d128[9+o]] + g[Y+d64[9+o]] + b[Y+d128[9+o]])<<4; \ | ||
493 | dst_2[i]= acc; | ||
494 | |||
495 | |||
496 | RGB(0); | ||
497 | DST1bpp4(0,0); | ||
498 | DST2bpp4(0,0); | ||
499 | |||
500 | RGB(1); | ||
501 | DST2bpp4(1,2); | ||
502 | DST1bpp4(1,2); | ||
503 | |||
504 | RGB(2); | ||
505 | DST1bpp4(2,4); | ||
506 | DST2bpp4(2,4); | ||
507 | |||
508 | RGB(3); | ||
509 | DST2bpp4(3,6); | ||
510 | DST1bpp4(3,6); | ||
511 | EPILOG(4) | ||
512 | |||
513 | // This is exactly the same code as yuv2rgb_c_32 except for the types of | ||
514 | // r, g, b, dst_1, dst_2 | ||
515 | #if HAVE_DEAD_CODE | ||
516 | PROLOG(yuv2rgb_c_4b, uint8_t) | ||
517 | RGB(0); | ||
518 | DST1(0); | ||
519 | DST2(0); | ||
520 | |||
521 | RGB(1); | ||
522 | DST2(1); | ||
523 | DST1(1); | ||
524 | |||
525 | RGB(2); | ||
526 | DST1(2); | ||
527 | DST2(2); | ||
528 | |||
529 | RGB(3); | ||
530 | DST2(3); | ||
531 | DST1(3); | ||
532 | EPILOG(8) | ||
533 | #endif | ||
534 | |||
535 | PROLOG(yuv2rgb_c_4b_ordered_dither, uint8_t) | ||
536 | const uint8_t *d64= dither_8x8_73[y&7]; | ||
537 | const uint8_t *d128=dither_8x8_220[y&7]; | ||
538 | |||
539 | #define DST1bpp4b(i,o) \ | ||
540 | Y = py_1[2*i]; \ | ||
541 | dst_1[2*i] = r[Y+d128[0+o]] + g[Y+d64[0+o]] + b[Y+d128[0+o]]; \ | ||
542 | Y = py_1[2*i+1]; \ | ||
543 | dst_1[2*i+1] = r[Y+d128[1+o]] + g[Y+d64[1+o]] + b[Y+d128[1+o]]; | ||
544 | |||
545 | #define DST2bpp4b(i,o) \ | ||
546 | Y = py_2[2*i]; \ | ||
547 | dst_2[2*i] = r[Y+d128[8+o]] + g[Y+d64[8+o]] + b[Y+d128[8+o]]; \ | ||
548 | Y = py_2[2*i+1]; \ | ||
549 | dst_2[2*i+1] = r[Y+d128[9+o]] + g[Y+d64[9+o]] + b[Y+d128[9+o]]; | ||
550 | |||
551 | |||
552 | RGB(0); | ||
553 | DST1bpp4b(0,0); | ||
554 | DST2bpp4b(0,0); | ||
555 | |||
556 | RGB(1); | ||
557 | DST2bpp4b(1,2); | ||
558 | DST1bpp4b(1,2); | ||
559 | |||
560 | RGB(2); | ||
561 | DST1bpp4b(2,4); | ||
562 | DST2bpp4b(2,4); | ||
563 | |||
564 | RGB(3); | ||
565 | DST2bpp4b(3,6); | ||
566 | DST1bpp4b(3,6); | ||
567 | EPILOG(8) | ||
568 | |||
569 | PROLOG(yuv2rgb_c_1_ordered_dither, uint8_t) | ||
570 | const uint8_t *d128=dither_8x8_220[y&7]; | ||
571 | char out_1=0, out_2=0; | ||
572 | g= c->table_gU[128] + c->table_gV[128]; | ||
573 | |||
574 | #define DST1bpp1(i,o) \ | ||
575 | Y = py_1[2*i]; \ | ||
576 | out_1+= out_1 + g[Y+d128[0+o]]; \ | ||
577 | Y = py_1[2*i+1]; \ | ||
578 | out_1+= out_1 + g[Y+d128[1+o]]; | ||
579 | |||
580 | #define DST2bpp1(i,o) \ | ||
581 | Y = py_2[2*i]; \ | ||
582 | out_2+= out_2 + g[Y+d128[8+o]]; \ | ||
583 | Y = py_2[2*i+1]; \ | ||
584 | out_2+= out_2 + g[Y+d128[9+o]]; | ||
585 | |||
586 | DST1bpp1(0,0); | ||
587 | DST2bpp1(0,0); | ||
588 | |||
589 | DST2bpp1(1,2); | ||
590 | DST1bpp1(1,2); | ||
591 | |||
592 | DST1bpp1(2,4); | ||
593 | DST2bpp1(2,4); | ||
594 | |||
595 | DST2bpp1(3,6); | ||
596 | DST1bpp1(3,6); | ||
597 | |||
598 | dst_1[0]= out_1; | ||
599 | dst_2[0]= out_2; | ||
600 | EPILOG(1) | ||
601 | |||
602 | SwsFunc yuv2rgb_get_func_ptr (SwsContext *c) | ||
603 | { | ||
604 | #if defined(HAVE_MMX2) || defined(HAVE_MMX) | ||
605 | if (c->flags & SWS_CPU_CAPS_MMX2){ | ||
606 | switch(c->dstFormat){ | ||
607 | case PIX_FMT_RGB32: return yuv420_rgb32_MMX2; | ||
608 | case PIX_FMT_BGR24: return yuv420_rgb24_MMX2; | ||
609 | case PIX_FMT_BGR565: return yuv420_rgb16_MMX2; | ||
610 | case PIX_FMT_BGR555: return yuv420_rgb15_MMX2; | ||
611 | } | ||
612 | } | ||
613 | if (c->flags & SWS_CPU_CAPS_MMX){ | ||
614 | switch(c->dstFormat){ | ||
615 | case PIX_FMT_RGB32: return yuv420_rgb32_MMX; | ||
616 | case PIX_FMT_BGR24: return yuv420_rgb24_MMX; | ||
617 | case PIX_FMT_BGR565: return yuv420_rgb16_MMX; | ||
618 | case PIX_FMT_BGR555: return yuv420_rgb15_MMX; | ||
619 | } | ||
620 | } | ||
621 | #endif | ||
622 | #ifdef HAVE_VIS | ||
623 | { | ||
624 | SwsFunc t= yuv2rgb_init_vis(c); | ||
625 | if (t) return t; | ||
626 | } | ||
627 | #endif | ||
628 | #ifdef CONFIG_MLIB | ||
629 | { | ||
630 | SwsFunc t= yuv2rgb_init_mlib(c); | ||
631 | if (t) return t; | ||
632 | } | ||
633 | #endif | ||
634 | #ifdef HAVE_ALTIVEC | ||
635 | if (c->flags & SWS_CPU_CAPS_ALTIVEC) | ||
636 | { | ||
637 | SwsFunc t = yuv2rgb_init_altivec(c); | ||
638 | if (t) return t; | ||
639 | } | ||
640 | #endif | ||
641 | |||
642 | #ifdef ARCH_BFIN | ||
643 | if (c->flags & SWS_CPU_CAPS_BFIN) | ||
644 | { | ||
645 | SwsFunc t = ff_bfin_yuv2rgb_get_func_ptr (c); | ||
646 | if (t) return t; | ||
647 | } | ||
648 | #endif | ||
649 | |||
650 | av_log(c, AV_LOG_WARNING, "No accelerated colorspace conversion found.\n"); | ||
651 | |||
652 | switch(c->dstFormat){ | ||
653 | case PIX_FMT_BGR32: | ||
654 | case PIX_FMT_RGB32: return yuv2rgb_c_32; | ||
655 | case PIX_FMT_RGB24: return yuv2rgb_c_24_rgb; | ||
656 | case PIX_FMT_BGR24: return yuv2rgb_c_24_bgr; | ||
657 | case PIX_FMT_RGB565: | ||
658 | case PIX_FMT_BGR565: | ||
659 | case PIX_FMT_RGB555: | ||
660 | case PIX_FMT_BGR555: return yuv2rgb_c_16; | ||
661 | case PIX_FMT_RGB8: | ||
662 | case PIX_FMT_BGR8: return yuv2rgb_c_8_ordered_dither; | ||
663 | case PIX_FMT_RGB4: | ||
664 | case PIX_FMT_BGR4: return yuv2rgb_c_4_ordered_dither; | ||
665 | case PIX_FMT_RGB4_BYTE: | ||
666 | case PIX_FMT_BGR4_BYTE: return yuv2rgb_c_4b_ordered_dither; | ||
667 | case PIX_FMT_MONOBLACK: return yuv2rgb_c_1_ordered_dither; | ||
668 | default: | ||
669 | assert(0); | ||
670 | } | ||
671 | return NULL; | ||
672 | } | ||
673 | |||
674 | static int div_round (int dividend, int divisor) | ||
675 | { | ||
676 | if (dividend > 0) | ||
677 | return (dividend + (divisor>>1)) / divisor; | ||
678 | else | ||
679 | return -((-dividend + (divisor>>1)) / divisor); | ||
680 | } | ||
681 | |||
682 | int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation) | ||
683 | { | ||
684 | const int isRgb = isBGR(c->dstFormat); | ||
685 | const int bpp = fmt_depth(c->dstFormat); | ||
686 | int i; | ||
687 | uint8_t table_Y[1024]; | ||
688 | uint32_t *table_32 = 0; | ||
689 | uint16_t *table_16 = 0; | ||
690 | uint8_t *table_8 = 0; | ||
691 | uint8_t *table_332 = 0; | ||
692 | uint8_t *table_121 = 0; | ||
693 | uint8_t *table_1 = 0; | ||
694 | int entry_size = 0; | ||
695 | void *table_r = 0, *table_g = 0, *table_b = 0; | ||
696 | void *table_start; | ||
697 | |||
698 | int64_t crv = inv_table[0]; | ||
699 | int64_t cbu = inv_table[1]; | ||
700 | int64_t cgu = -inv_table[2]; | ||
701 | int64_t cgv = -inv_table[3]; | ||
702 | int64_t cy = 1<<16; | ||
703 | int64_t oy = 0; | ||
704 | |||
705 | //printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv); | ||
706 | if (!fullRange){ | ||
707 | cy= (cy*255) / 219; | ||
708 | oy= 16<<16; | ||
709 | }else{ | ||
710 | crv= (crv*224) / 255; | ||
711 | cbu= (cbu*224) / 255; | ||
712 | cgu= (cgu*224) / 255; | ||
713 | cgv= (cgv*224) / 255; | ||
714 | } | ||
715 | |||
716 | cy = (cy *contrast )>>16; | ||
717 | crv= (crv*contrast * saturation)>>32; | ||
718 | cbu= (cbu*contrast * saturation)>>32; | ||
719 | cgu= (cgu*contrast * saturation)>>32; | ||
720 | cgv= (cgv*contrast * saturation)>>32; | ||
721 | //printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv); | ||
722 | oy -= 256*brightness; | ||
723 | |||
724 | for (i = 0; i < 1024; i++) { | ||
725 | int j; | ||
726 | |||
727 | j= (cy*(((i - 384)<<16) - oy) + (1<<31))>>32; | ||
728 | j = (j < 0) ? 0 : ((j > 255) ? 255 : j); | ||
729 | table_Y[i] = j; | ||
730 | } | ||
731 | |||
732 | switch (bpp) { | ||
733 | case 32: | ||
734 | table_start= table_32 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint32_t)); | ||
735 | |||
736 | entry_size = sizeof (uint32_t); | ||
737 | table_r = table_32 + 197; | ||
738 | table_b = table_32 + 197 + 685; | ||
739 | table_g = table_32 + 197 + 2*682; | ||
740 | |||
741 | for (i = -197; i < 256+197; i++) | ||
742 | ((uint32_t *)table_r)[i] = table_Y[i+384] << (isRgb ? 16 : 0); | ||
743 | for (i = -132; i < 256+132; i++) | ||
744 | ((uint32_t *)table_g)[i] = table_Y[i+384] << 8; | ||
745 | for (i = -232; i < 256+232; i++) | ||
746 | ((uint32_t *)table_b)[i] = table_Y[i+384] << (isRgb ? 0 : 16); | ||
747 | break; | ||
748 | |||
749 | case 24: | ||
750 | table_start= table_8 = av_malloc ((256 + 2*232) * sizeof (uint8_t)); | ||
751 | |||
752 | entry_size = sizeof (uint8_t); | ||
753 | table_r = table_g = table_b = table_8 + 232; | ||
754 | |||
755 | for (i = -232; i < 256+232; i++) | ||
756 | ((uint8_t * )table_b)[i] = table_Y[i+384]; | ||
757 | break; | ||
758 | |||
759 | case 15: | ||
760 | case 16: | ||
761 | table_start= table_16 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint16_t)); | ||
762 | |||
763 | entry_size = sizeof (uint16_t); | ||
764 | table_r = table_16 + 197; | ||
765 | table_b = table_16 + 197 + 685; | ||
766 | table_g = table_16 + 197 + 2*682; | ||
767 | |||
768 | for (i = -197; i < 256+197; i++) { | ||
769 | int j = table_Y[i+384] >> 3; | ||
770 | |||
771 | if (isRgb) | ||
772 | j <<= ((bpp==16) ? 11 : 10); | ||
773 | |||
774 | ((uint16_t *)table_r)[i] = j; | ||
775 | } | ||
776 | for (i = -132; i < 256+132; i++) { | ||
777 | int j = table_Y[i+384] >> ((bpp==16) ? 2 : 3); | ||
778 | |||
779 | ((uint16_t *)table_g)[i] = j << 5; | ||
780 | } | ||
781 | for (i = -232; i < 256+232; i++) { | ||
782 | int j = table_Y[i+384] >> 3; | ||
783 | |||
784 | if (!isRgb) | ||
785 | j <<= ((bpp==16) ? 11 : 10); | ||
786 | |||
787 | ((uint16_t *)table_b)[i] = j; | ||
788 | } | ||
789 | break; | ||
790 | |||
791 | case 8: | ||
792 | table_start= table_332 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint8_t)); | ||
793 | |||
794 | entry_size = sizeof (uint8_t); | ||
795 | table_r = table_332 + 197; | ||
796 | table_b = table_332 + 197 + 685; | ||
797 | table_g = table_332 + 197 + 2*682; | ||
798 | |||
799 | for (i = -197; i < 256+197; i++) { | ||
800 | int j = (table_Y[i+384 - 16] + 18)/36; | ||
801 | |||
802 | if (isRgb) | ||
803 | j <<= 5; | ||
804 | |||
805 | ((uint8_t *)table_r)[i] = j; | ||
806 | } | ||
807 | for (i = -132; i < 256+132; i++) { | ||
808 | int j = (table_Y[i+384 - 16] + 18)/36; | ||
809 | |||
810 | if (!isRgb) | ||
811 | j <<= 1; | ||
812 | |||
813 | ((uint8_t *)table_g)[i] = j << 2; | ||
814 | } | ||
815 | for (i = -232; i < 256+232; i++) { | ||
816 | int j = (table_Y[i+384 - 37] + 43)/85; | ||
817 | |||
818 | if (!isRgb) | ||
819 | j <<= 6; | ||
820 | |||
821 | ((uint8_t *)table_b)[i] = j; | ||
822 | } | ||
823 | break; | ||
824 | case 4: | ||
825 | case 4|128: | ||
826 | table_start= table_121 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint8_t)); | ||
827 | |||
828 | entry_size = sizeof (uint8_t); | ||
829 | table_r = table_121 + 197; | ||
830 | table_b = table_121 + 197 + 685; | ||
831 | table_g = table_121 + 197 + 2*682; | ||
832 | |||
833 | for (i = -197; i < 256+197; i++) { | ||
834 | int j = table_Y[i+384 - 110] >> 7; | ||
835 | |||
836 | if (isRgb) | ||
837 | j <<= 3; | ||
838 | |||
839 | ((uint8_t *)table_r)[i] = j; | ||
840 | } | ||
841 | for (i = -132; i < 256+132; i++) { | ||
842 | int j = (table_Y[i+384 - 37]+ 43)/85; | ||
843 | |||
844 | ((uint8_t *)table_g)[i] = j << 1; | ||
845 | } | ||
846 | for (i = -232; i < 256+232; i++) { | ||
847 | int j =table_Y[i+384 - 110] >> 7; | ||
848 | |||
849 | if (!isRgb) | ||
850 | j <<= 3; | ||
851 | |||
852 | ((uint8_t *)table_b)[i] = j; | ||
853 | } | ||
854 | break; | ||
855 | |||
856 | case 1: | ||
857 | table_start= table_1 = av_malloc (256*2 * sizeof (uint8_t)); | ||
858 | |||
859 | entry_size = sizeof (uint8_t); | ||
860 | table_g = table_1; | ||
861 | table_r = table_b = NULL; | ||
862 | |||
863 | for (i = 0; i < 256+256; i++) { | ||
864 | int j = table_Y[i + 384 - 110]>>7; | ||
865 | |||
866 | ((uint8_t *)table_g)[i] = j; | ||
867 | } | ||
868 | break; | ||
869 | |||
870 | default: | ||
871 | table_start= NULL; | ||
872 | av_log(c, AV_LOG_ERROR, "%ibpp not supported by yuv2rgb\n", bpp); | ||
873 | //free mem? | ||
874 | return -1; | ||
875 | } | ||
876 | |||
877 | for (i = 0; i < 256; i++) { | ||
878 | c->table_rV[i] = (uint8_t *)table_r + entry_size * div_round (crv * (i-128), 76309); | ||
879 | c->table_gU[i] = (uint8_t *)table_g + entry_size * div_round (cgu * (i-128), 76309); | ||
880 | c->table_gV[i] = entry_size * div_round (cgv * (i-128), 76309); | ||
881 | c->table_bU[i] = (uint8_t *)table_b + entry_size * div_round (cbu * (i-128), 76309); | ||
882 | } | ||
883 | |||
884 | av_free(c->yuvTable); | ||
885 | c->yuvTable= table_start; | ||
886 | return 0; | ||
887 | } | ||
diff --git a/src/plugins/ffmpeg/libswscale/yuv2rgb_altivec.c b/src/plugins/ffmpeg/libswscale/yuv2rgb_altivec.c deleted file mode 100644 index 13b18d1..0000000 --- a/src/plugins/ffmpeg/libswscale/yuv2rgb_altivec.c +++ /dev/null | |||
@@ -1,965 +0,0 @@ | |||
1 | /* | ||
2 | * AltiVec acceleration for colorspace conversion | ||
3 | * | ||
4 | * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com> | ||
5 | * | ||
6 | * This file is part of FFmpeg. | ||
7 | * | ||
8 | * FFmpeg is free software; you can redistribute it and/or modify | ||
9 | * it under the terms of the GNU General Public License as published by | ||
10 | * the Free Software Foundation; either version 2 of the License, or | ||
11 | * (at your option) any later version. | ||
12 | * | ||
13 | * FFmpeg is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | * GNU General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU General Public License | ||
19 | * along with FFmpeg; if not, write to the Free Software | ||
20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
21 | */ | ||
22 | |||
23 | /* | ||
24 | Convert I420 YV12 to RGB in various formats, | ||
25 | it rejects images that are not in 420 formats, | ||
26 | it rejects images that don't have widths of multiples of 16, | ||
27 | it rejects images that don't have heights of multiples of 2. | ||
28 | Reject defers to C simulation code. | ||
29 | |||
30 | Lots of optimizations to be done here. | ||
31 | |||
32 | 1. Need to fix saturation code. I just couldn't get it to fly with packs | ||
33 | and adds, so we currently use max/min to clip. | ||
34 | |||
35 | 2. The inefficient use of chroma loading needs a bit of brushing up. | ||
36 | |||
37 | 3. Analysis of pipeline stalls needs to be done. Use shark to identify | ||
38 | pipeline stalls. | ||
39 | |||
40 | |||
41 | MODIFIED to calculate coeffs from currently selected color space. | ||
42 | MODIFIED core to be a macro where you specify the output format. | ||
43 | ADDED UYVY conversion which is never called due to some thing in swscale. | ||
44 | CORRECTED algorithim selection to be strict on input formats. | ||
45 | ADDED runtime detection of AltiVec. | ||
46 | |||
47 | ADDED altivec_yuv2packedX vertical scl + RGB converter | ||
48 | |||
49 | March 27,2004 | ||
50 | PERFORMANCE ANALYSIS | ||
51 | |||
52 | The C version uses 25% of the processor or ~250Mips for D1 video rawvideo | ||
53 | used as test. | ||
54 | The AltiVec version uses 10% of the processor or ~100Mips for D1 video | ||
55 | same sequence. | ||
56 | |||
57 | 720 * 480 * 30 ~10MPS | ||
58 | |||
59 | so we have roughly 10 clocks per pixel. This is too high, something has | ||
60 | to be wrong. | ||
61 | |||
62 | OPTIMIZED clip codes to utilize vec_max and vec_packs removing the | ||
63 | need for vec_min. | ||
64 | |||
65 | OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have | ||
66 | the input video frame, it was just decompressed so it probably resides in L1 | ||
67 | caches. However, we are creating the output video stream. This needs to use the | ||
68 | DSTST instruction to optimize for the cache. We couple this with the fact that | ||
69 | we are not going to be visiting the input buffer again so we mark it Least | ||
70 | Recently Used. This shaves 25% of the processor cycles off. | ||
71 | |||
72 | Now memcpy is the largest mips consumer in the system, probably due | ||
73 | to the inefficient X11 stuff. | ||
74 | |||
75 | GL libraries seem to be very slow on this machine 1.33Ghz PB running | ||
76 | Jaguar, this is not the case for my 1Ghz PB. I thought it might be | ||
77 | a versioning issue, however I have libGL.1.2.dylib for both | ||
78 | machines. (We need to figure this out now.) | ||
79 | |||
80 | GL2 libraries work now with patch for RGB32. | ||
81 | |||
82 | NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor. | ||
83 | |||
84 | Integrated luma prescaling adjustment for saturation/contrast/brightness | ||
85 | adjustment. | ||
86 | */ | ||
87 | |||
88 | #include <stdio.h> | ||
89 | #include <stdlib.h> | ||
90 | #include <string.h> | ||
91 | #include <inttypes.h> | ||
92 | #include <assert.h> | ||
93 | #include "config.h" | ||
94 | #ifdef HAVE_MALLOC_H | ||
95 | #include <malloc.h> | ||
96 | #endif | ||
97 | #include "rgb2rgb.h" | ||
98 | #include "swscale.h" | ||
99 | #include "swscale_internal.h" | ||
100 | |||
101 | #undef PROFILE_THE_BEAST | ||
102 | #undef INC_SCALING | ||
103 | |||
104 | typedef unsigned char ubyte; | ||
105 | typedef signed char sbyte; | ||
106 | |||
107 | |||
108 | /* RGB interleaver, 16 planar pels 8-bit samples per channel in | ||
109 | homogeneous vector registers x0,x1,x2 are interleaved with the | ||
110 | following technique: | ||
111 | |||
112 | o0 = vec_mergeh (x0,x1); | ||
113 | o1 = vec_perm (o0, x2, perm_rgb_0); | ||
114 | o2 = vec_perm (o0, x2, perm_rgb_1); | ||
115 | o3 = vec_mergel (x0,x1); | ||
116 | o4 = vec_perm (o3,o2,perm_rgb_2); | ||
117 | o5 = vec_perm (o3,o2,perm_rgb_3); | ||
118 | |||
119 | perm_rgb_0: o0(RG).h v1(B) --> o1* | ||
120 | 0 1 2 3 4 | ||
121 | rgbr|gbrg|brgb|rgbr | ||
122 | 0010 0100 1001 0010 | ||
123 | 0102 3145 2673 894A | ||
124 | |||
125 | perm_rgb_1: o0(RG).h v1(B) --> o2 | ||
126 | 0 1 2 3 4 | ||
127 | gbrg|brgb|bbbb|bbbb | ||
128 | 0100 1001 1111 1111 | ||
129 | B5CD 6EF7 89AB CDEF | ||
130 | |||
131 | perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4* | ||
132 | 0 1 2 3 4 | ||
133 | gbrg|brgb|rgbr|gbrg | ||
134 | 1111 1111 0010 0100 | ||
135 | 89AB CDEF 0182 3945 | ||
136 | |||
137 | perm_rgb_2: o3(RG).l o2(rgbB.l) ---> o5* | ||
138 | 0 1 2 3 4 | ||
139 | brgb|rgbr|gbrg|brgb | ||
140 | 1001 0010 0100 1001 | ||
141 | a67b 89cA BdCD eEFf | ||
142 | |||
143 | */ | ||
144 | static | ||
145 | const vector unsigned char | ||
146 | perm_rgb_0 = AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05, | ||
147 | 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a), | ||
148 | perm_rgb_1 = AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17, | ||
149 | 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f), | ||
150 | perm_rgb_2 = AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, | ||
151 | 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05), | ||
152 | perm_rgb_3 = AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a, | ||
153 | 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f); | ||
154 | |||
155 | #define vec_merge3(x2,x1,x0,y0,y1,y2) \ | ||
156 | do { \ | ||
157 | typeof(x0) o0,o2,o3; \ | ||
158 | o0 = vec_mergeh (x0,x1); \ | ||
159 | y0 = vec_perm (o0, x2, perm_rgb_0); \ | ||
160 | o2 = vec_perm (o0, x2, perm_rgb_1); \ | ||
161 | o3 = vec_mergel (x0,x1); \ | ||
162 | y1 = vec_perm (o3,o2,perm_rgb_2); \ | ||
163 | y2 = vec_perm (o3,o2,perm_rgb_3); \ | ||
164 | } while(0) | ||
165 | |||
166 | #define vec_mstbgr24(x0,x1,x2,ptr) \ | ||
167 | do { \ | ||
168 | typeof(x0) _0,_1,_2; \ | ||
169 | vec_merge3 (x0,x1,x2,_0,_1,_2); \ | ||
170 | vec_st (_0, 0, ptr++); \ | ||
171 | vec_st (_1, 0, ptr++); \ | ||
172 | vec_st (_2, 0, ptr++); \ | ||
173 | } while (0); | ||
174 | |||
175 | #define vec_mstrgb24(x0,x1,x2,ptr) \ | ||
176 | do { \ | ||
177 | typeof(x0) _0,_1,_2; \ | ||
178 | vec_merge3 (x2,x1,x0,_0,_1,_2); \ | ||
179 | vec_st (_0, 0, ptr++); \ | ||
180 | vec_st (_1, 0, ptr++); \ | ||
181 | vec_st (_2, 0, ptr++); \ | ||
182 | } while (0); | ||
183 | |||
184 | /* pack the pixels in rgb0 format | ||
185 | msb R | ||
186 | lsb 0 | ||
187 | */ | ||
188 | #define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \ | ||
189 | do { \ | ||
190 | T _0,_1,_2,_3; \ | ||
191 | _0 = vec_mergeh (x0,x1); \ | ||
192 | _1 = vec_mergeh (x2,x3); \ | ||
193 | _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \ | ||
194 | _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \ | ||
195 | vec_st (_2, 0*16, (T *)ptr); \ | ||
196 | vec_st (_3, 1*16, (T *)ptr); \ | ||
197 | _0 = vec_mergel (x0,x1); \ | ||
198 | _1 = vec_mergel (x2,x3); \ | ||
199 | _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \ | ||
200 | _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \ | ||
201 | vec_st (_2, 2*16, (T *)ptr); \ | ||
202 | vec_st (_3, 3*16, (T *)ptr); \ | ||
203 | ptr += 4; \ | ||
204 | } while (0); | ||
205 | |||
206 | /* | ||
207 | |||
208 | | 1 0 1.4021 | | Y | | ||
209 | | 1 -0.3441 -0.7142 |x| Cb| | ||
210 | | 1 1.7718 0 | | Cr| | ||
211 | |||
212 | |||
213 | Y: [-128 127] | ||
214 | Cb/Cr : [-128 127] | ||
215 | |||
216 | typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode. | ||
217 | |||
218 | */ | ||
219 | |||
220 | |||
221 | |||
222 | |||
223 | #define vec_unh(x) \ | ||
224 | (vector signed short) \ | ||
225 | vec_perm(x,(typeof(x))AVV(0),\ | ||
226 | (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\ | ||
227 | 0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07)) | ||
228 | #define vec_unl(x) \ | ||
229 | (vector signed short) \ | ||
230 | vec_perm(x,(typeof(x))AVV(0),\ | ||
231 | (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\ | ||
232 | 0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F)) | ||
233 | |||
234 | #define vec_clip_s16(x) \ | ||
235 | vec_max (vec_min (x, (vector signed short)AVV(235,235,235,235,235,235,235,235)),\ | ||
236 | (vector signed short)AVV( 16, 16, 16, 16, 16, 16, 16, 16)) | ||
237 | |||
238 | #define vec_packclp(x,y) \ | ||
239 | (vector unsigned char)vec_packs \ | ||
240 | ((vector unsigned short)vec_max (x,(vector signed short) AVV(0)), \ | ||
241 | (vector unsigned short)vec_max (y,(vector signed short) AVV(0))) | ||
242 | |||
243 | //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr) | ||
244 | |||
245 | |||
246 | static inline void cvtyuvtoRGB (SwsContext *c, | ||
247 | vector signed short Y, vector signed short U, vector signed short V, | ||
248 | vector signed short *R, vector signed short *G, vector signed short *B) | ||
249 | { | ||
250 | vector signed short vx,ux,uvx; | ||
251 | |||
252 | Y = vec_mradds (Y, c->CY, c->OY); | ||
253 | U = vec_sub (U,(vector signed short) | ||
254 | vec_splat((vector signed short)AVV(128),0)); | ||
255 | V = vec_sub (V,(vector signed short) | ||
256 | vec_splat((vector signed short)AVV(128),0)); | ||
257 | |||
258 | // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15; | ||
259 | ux = vec_sl (U, c->CSHIFT); | ||
260 | *B = vec_mradds (ux, c->CBU, Y); | ||
261 | |||
262 | // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15; | ||
263 | vx = vec_sl (V, c->CSHIFT); | ||
264 | *R = vec_mradds (vx, c->CRV, Y); | ||
265 | |||
266 | // uvx = ((CGU*u) + (CGV*v))>>15; | ||
267 | uvx = vec_mradds (U, c->CGU, Y); | ||
268 | *G = vec_mradds (V, c->CGV, uvx); | ||
269 | } | ||
270 | |||
271 | |||
272 | /* | ||
273 | ------------------------------------------------------------------------------ | ||
274 | CS converters | ||
275 | ------------------------------------------------------------------------------ | ||
276 | */ | ||
277 | |||
278 | |||
279 | #define DEFCSP420_CVT(name,out_pixels) \ | ||
280 | static int altivec_##name (SwsContext *c, \ | ||
281 | unsigned char **in, int *instrides, \ | ||
282 | int srcSliceY, int srcSliceH, \ | ||
283 | unsigned char **oplanes, int *outstrides) \ | ||
284 | { \ | ||
285 | int w = c->srcW; \ | ||
286 | int h = srcSliceH; \ | ||
287 | int i,j; \ | ||
288 | int instrides_scl[3]; \ | ||
289 | vector unsigned char y0,y1; \ | ||
290 | \ | ||
291 | vector signed char u,v; \ | ||
292 | \ | ||
293 | vector signed short Y0,Y1,Y2,Y3; \ | ||
294 | vector signed short U,V; \ | ||
295 | vector signed short vx,ux,uvx; \ | ||
296 | vector signed short vx0,ux0,uvx0; \ | ||
297 | vector signed short vx1,ux1,uvx1; \ | ||
298 | vector signed short R0,G0,B0; \ | ||
299 | vector signed short R1,G1,B1; \ | ||
300 | vector unsigned char R,G,B; \ | ||
301 | \ | ||
302 | vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \ | ||
303 | vector unsigned char align_perm; \ | ||
304 | \ | ||
305 | vector signed short \ | ||
306 | lCY = c->CY, \ | ||
307 | lOY = c->OY, \ | ||
308 | lCRV = c->CRV, \ | ||
309 | lCBU = c->CBU, \ | ||
310 | lCGU = c->CGU, \ | ||
311 | lCGV = c->CGV; \ | ||
312 | \ | ||
313 | vector unsigned short lCSHIFT = c->CSHIFT; \ | ||
314 | \ | ||
315 | ubyte *y1i = in[0]; \ | ||
316 | ubyte *y2i = in[0]+instrides[0]; \ | ||
317 | ubyte *ui = in[1]; \ | ||
318 | ubyte *vi = in[2]; \ | ||
319 | \ | ||
320 | vector unsigned char *oute \ | ||
321 | = (vector unsigned char *) \ | ||
322 | (oplanes[0]+srcSliceY*outstrides[0]); \ | ||
323 | vector unsigned char *outo \ | ||
324 | = (vector unsigned char *) \ | ||
325 | (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); \ | ||
326 | \ | ||
327 | \ | ||
328 | instrides_scl[0] = instrides[0]*2-w; /* the loop moves y{1,2}i by w */ \ | ||
329 | instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ \ | ||
330 | instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ \ | ||
331 | \ | ||
332 | \ | ||
333 | for (i=0;i<h/2;i++) { \ | ||
334 | vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); \ | ||
335 | vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); \ | ||
336 | \ | ||
337 | for (j=0;j<w/16;j++) { \ | ||
338 | \ | ||
339 | y1ivP = (vector unsigned char *)y1i; \ | ||
340 | y2ivP = (vector unsigned char *)y2i; \ | ||
341 | uivP = (vector unsigned char *)ui; \ | ||
342 | vivP = (vector unsigned char *)vi; \ | ||
343 | \ | ||
344 | align_perm = vec_lvsl (0, y1i); \ | ||
345 | y0 = (vector unsigned char) \ | ||
346 | vec_perm (y1ivP[0], y1ivP[1], align_perm); \ | ||
347 | \ | ||
348 | align_perm = vec_lvsl (0, y2i); \ | ||
349 | y1 = (vector unsigned char) \ | ||
350 | vec_perm (y2ivP[0], y2ivP[1], align_perm); \ | ||
351 | \ | ||
352 | align_perm = vec_lvsl (0, ui); \ | ||
353 | u = (vector signed char) \ | ||
354 | vec_perm (uivP[0], uivP[1], align_perm); \ | ||
355 | \ | ||
356 | align_perm = vec_lvsl (0, vi); \ | ||
357 | v = (vector signed char) \ | ||
358 | vec_perm (vivP[0], vivP[1], align_perm); \ | ||
359 | \ | ||
360 | u = (vector signed char) \ | ||
361 | vec_sub (u,(vector signed char) \ | ||
362 | vec_splat((vector signed char)AVV(128),0)); \ | ||
363 | v = (vector signed char) \ | ||
364 | vec_sub (v,(vector signed char) \ | ||
365 | vec_splat((vector signed char)AVV(128),0)); \ | ||
366 | \ | ||
367 | U = vec_unpackh (u); \ | ||
368 | V = vec_unpackh (v); \ | ||
369 | \ | ||
370 | \ | ||
371 | Y0 = vec_unh (y0); \ | ||
372 | Y1 = vec_unl (y0); \ | ||
373 | Y2 = vec_unh (y1); \ | ||
374 | Y3 = vec_unl (y1); \ | ||
375 | \ | ||
376 | Y0 = vec_mradds (Y0, lCY, lOY); \ | ||
377 | Y1 = vec_mradds (Y1, lCY, lOY); \ | ||
378 | Y2 = vec_mradds (Y2, lCY, lOY); \ | ||
379 | Y3 = vec_mradds (Y3, lCY, lOY); \ | ||
380 | \ | ||
381 | /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ \ | ||
382 | ux = vec_sl (U, lCSHIFT); \ | ||
383 | ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0)); \ | ||
384 | ux0 = vec_mergeh (ux,ux); \ | ||
385 | ux1 = vec_mergel (ux,ux); \ | ||
386 | \ | ||
387 | /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ \ | ||
388 | vx = vec_sl (V, lCSHIFT); \ | ||
389 | vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0)); \ | ||
390 | vx0 = vec_mergeh (vx,vx); \ | ||
391 | vx1 = vec_mergel (vx,vx); \ | ||
392 | \ | ||
393 | /* uvx = ((CGU*u) + (CGV*v))>>15 */ \ | ||
394 | uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0)); \ | ||
395 | uvx = vec_mradds (V, lCGV, uvx); \ | ||
396 | uvx0 = vec_mergeh (uvx,uvx); \ | ||
397 | uvx1 = vec_mergel (uvx,uvx); \ | ||
398 | \ | ||
399 | R0 = vec_add (Y0,vx0); \ | ||
400 | G0 = vec_add (Y0,uvx0); \ | ||
401 | B0 = vec_add (Y0,ux0); \ | ||
402 | R1 = vec_add (Y1,vx1); \ | ||
403 | G1 = vec_add (Y1,uvx1); \ | ||
404 | B1 = vec_add (Y1,ux1); \ | ||
405 | \ | ||
406 | R = vec_packclp (R0,R1); \ | ||
407 | G = vec_packclp (G0,G1); \ | ||
408 | B = vec_packclp (B0,B1); \ | ||
409 | \ | ||
410 | out_pixels(R,G,B,oute); \ | ||
411 | \ | ||
412 | R0 = vec_add (Y2,vx0); \ | ||
413 | G0 = vec_add (Y2,uvx0); \ | ||
414 | B0 = vec_add (Y2,ux0); \ | ||
415 | R1 = vec_add (Y3,vx1); \ | ||
416 | G1 = vec_add (Y3,uvx1); \ | ||
417 | B1 = vec_add (Y3,ux1); \ | ||
418 | R = vec_packclp (R0,R1); \ | ||
419 | G = vec_packclp (G0,G1); \ | ||
420 | B = vec_packclp (B0,B1); \ | ||
421 | \ | ||
422 | \ | ||
423 | out_pixels(R,G,B,outo); \ | ||
424 | \ | ||
425 | y1i += 16; \ | ||
426 | y2i += 16; \ | ||
427 | ui += 8; \ | ||
428 | vi += 8; \ | ||
429 | \ | ||
430 | } \ | ||
431 | \ | ||
432 | outo += (outstrides[0])>>4; \ | ||
433 | oute += (outstrides[0])>>4; \ | ||
434 | \ | ||
435 | ui += instrides_scl[1]; \ | ||
436 | vi += instrides_scl[2]; \ | ||
437 | y1i += instrides_scl[0]; \ | ||
438 | y2i += instrides_scl[0]; \ | ||
439 | } \ | ||
440 | return srcSliceH; \ | ||
441 | } | ||
442 | |||
443 | |||
444 | #define out_abgr(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),c,b,a,ptr) | ||
445 | #define out_bgra(a,b,c,ptr) vec_mstrgb32(typeof(a),c,b,a,((typeof (a))AVV(0)),ptr) | ||
446 | #define out_rgba(a,b,c,ptr) vec_mstrgb32(typeof(a),a,b,c,((typeof (a))AVV(0)),ptr) | ||
447 | #define out_argb(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,b,c,ptr) | ||
448 | #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr) | ||
449 | #define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr) | ||
450 | |||
451 | DEFCSP420_CVT (yuv2_abgr, out_abgr) | ||
452 | #if 1 | ||
453 | DEFCSP420_CVT (yuv2_bgra, out_bgra) | ||
454 | #else | ||
455 | static int altivec_yuv2_bgra32 (SwsContext *c, | ||
456 | unsigned char **in, int *instrides, | ||
457 | int srcSliceY, int srcSliceH, | ||
458 | unsigned char **oplanes, int *outstrides) | ||
459 | { | ||
460 | int w = c->srcW; | ||
461 | int h = srcSliceH; | ||
462 | int i,j; | ||
463 | int instrides_scl[3]; | ||
464 | vector unsigned char y0,y1; | ||
465 | |||
466 | vector signed char u,v; | ||
467 | |||
468 | vector signed short Y0,Y1,Y2,Y3; | ||
469 | vector signed short U,V; | ||
470 | vector signed short vx,ux,uvx; | ||
471 | vector signed short vx0,ux0,uvx0; | ||
472 | vector signed short vx1,ux1,uvx1; | ||
473 | vector signed short R0,G0,B0; | ||
474 | vector signed short R1,G1,B1; | ||
475 | vector unsigned char R,G,B; | ||
476 | |||
477 | vector unsigned char *uivP, *vivP; | ||
478 | vector unsigned char align_perm; | ||
479 | |||
480 | vector signed short | ||
481 | lCY = c->CY, | ||
482 | lOY = c->OY, | ||
483 | lCRV = c->CRV, | ||
484 | lCBU = c->CBU, | ||
485 | lCGU = c->CGU, | ||
486 | lCGV = c->CGV; | ||
487 | |||
488 | vector unsigned short lCSHIFT = c->CSHIFT; | ||
489 | |||
490 | ubyte *y1i = in[0]; | ||
491 | ubyte *y2i = in[0]+w; | ||
492 | ubyte *ui = in[1]; | ||
493 | ubyte *vi = in[2]; | ||
494 | |||
495 | vector unsigned char *oute | ||
496 | = (vector unsigned char *) | ||
497 | (oplanes[0]+srcSliceY*outstrides[0]); | ||
498 | vector unsigned char *outo | ||
499 | = (vector unsigned char *) | ||
500 | (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); | ||
501 | |||
502 | |||
503 | instrides_scl[0] = instrides[0]; | ||
504 | instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ | ||
505 | instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ | ||
506 | |||
507 | |||
508 | for (i=0;i<h/2;i++) { | ||
509 | vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); | ||
510 | vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); | ||
511 | |||
512 | for (j=0;j<w/16;j++) { | ||
513 | |||
514 | y0 = vec_ldl (0,y1i); | ||
515 | y1 = vec_ldl (0,y2i); | ||
516 | uivP = (vector unsigned char *)ui; | ||
517 | vivP = (vector unsigned char *)vi; | ||
518 | |||
519 | align_perm = vec_lvsl (0, ui); | ||
520 | u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm); | ||
521 | |||
522 | align_perm = vec_lvsl (0, vi); | ||
523 | v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm); | ||
524 | u = (vector signed char) | ||
525 | vec_sub (u,(vector signed char) | ||
526 | vec_splat((vector signed char)AVV(128),0)); | ||
527 | |||
528 | v = (vector signed char) | ||
529 | vec_sub (v, (vector signed char) | ||
530 | vec_splat((vector signed char)AVV(128),0)); | ||
531 | |||
532 | U = vec_unpackh (u); | ||
533 | V = vec_unpackh (v); | ||
534 | |||
535 | |||
536 | Y0 = vec_unh (y0); | ||
537 | Y1 = vec_unl (y0); | ||
538 | Y2 = vec_unh (y1); | ||
539 | Y3 = vec_unl (y1); | ||
540 | |||
541 | Y0 = vec_mradds (Y0, lCY, lOY); | ||
542 | Y1 = vec_mradds (Y1, lCY, lOY); | ||
543 | Y2 = vec_mradds (Y2, lCY, lOY); | ||
544 | Y3 = vec_mradds (Y3, lCY, lOY); | ||
545 | |||
546 | /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ | ||
547 | ux = vec_sl (U, lCSHIFT); | ||
548 | ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0)); | ||
549 | ux0 = vec_mergeh (ux,ux); | ||
550 | ux1 = vec_mergel (ux,ux); | ||
551 | |||
552 | /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ | ||
553 | vx = vec_sl (V, lCSHIFT); | ||
554 | vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0)); | ||
555 | vx0 = vec_mergeh (vx,vx); | ||
556 | vx1 = vec_mergel (vx,vx); | ||
557 | /* uvx = ((CGU*u) + (CGV*v))>>15 */ | ||
558 | uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0)); | ||
559 | uvx = vec_mradds (V, lCGV, uvx); | ||
560 | uvx0 = vec_mergeh (uvx,uvx); | ||
561 | uvx1 = vec_mergel (uvx,uvx); | ||
562 | R0 = vec_add (Y0,vx0); | ||
563 | G0 = vec_add (Y0,uvx0); | ||
564 | B0 = vec_add (Y0,ux0); | ||
565 | R1 = vec_add (Y1,vx1); | ||
566 | G1 = vec_add (Y1,uvx1); | ||
567 | B1 = vec_add (Y1,ux1); | ||
568 | R = vec_packclp (R0,R1); | ||
569 | G = vec_packclp (G0,G1); | ||
570 | B = vec_packclp (B0,B1); | ||
571 | |||
572 | out_argb(R,G,B,oute); | ||
573 | R0 = vec_add (Y2,vx0); | ||
574 | G0 = vec_add (Y2,uvx0); | ||
575 | B0 = vec_add (Y2,ux0); | ||
576 | R1 = vec_add (Y3,vx1); | ||
577 | G1 = vec_add (Y3,uvx1); | ||
578 | B1 = vec_add (Y3,ux1); | ||
579 | R = vec_packclp (R0,R1); | ||
580 | G = vec_packclp (G0,G1); | ||
581 | B = vec_packclp (B0,B1); | ||
582 | |||
583 | out_argb(R,G,B,outo); | ||
584 | y1i += 16; | ||
585 | y2i += 16; | ||
586 | ui += 8; | ||
587 | vi += 8; | ||
588 | |||
589 | } | ||
590 | |||
591 | outo += (outstrides[0])>>4; | ||
592 | oute += (outstrides[0])>>4; | ||
593 | |||
594 | ui += instrides_scl[1]; | ||
595 | vi += instrides_scl[2]; | ||
596 | y1i += instrides_scl[0]; | ||
597 | y2i += instrides_scl[0]; | ||
598 | } | ||
599 | return srcSliceH; | ||
600 | } | ||
601 | |||
602 | #endif | ||
603 | |||
604 | |||
605 | DEFCSP420_CVT (yuv2_rgba, out_rgba) | ||
606 | DEFCSP420_CVT (yuv2_argb, out_argb) | ||
607 | DEFCSP420_CVT (yuv2_rgb24, out_rgb24) | ||
608 | DEFCSP420_CVT (yuv2_bgr24, out_bgr24) | ||
609 | |||
610 | |||
611 | // uyvy|uyvy|uyvy|uyvy | ||
612 | // 0123 4567 89ab cdef | ||
613 | static | ||
614 | const vector unsigned char | ||
615 | demux_u = AVV(0x10,0x00,0x10,0x00, | ||
616 | 0x10,0x04,0x10,0x04, | ||
617 | 0x10,0x08,0x10,0x08, | ||
618 | 0x10,0x0c,0x10,0x0c), | ||
619 | demux_v = AVV(0x10,0x02,0x10,0x02, | ||
620 | 0x10,0x06,0x10,0x06, | ||
621 | 0x10,0x0A,0x10,0x0A, | ||
622 | 0x10,0x0E,0x10,0x0E), | ||
623 | demux_y = AVV(0x10,0x01,0x10,0x03, | ||
624 | 0x10,0x05,0x10,0x07, | ||
625 | 0x10,0x09,0x10,0x0B, | ||
626 | 0x10,0x0D,0x10,0x0F); | ||
627 | |||
628 | /* | ||
629 | this is so I can play live CCIR raw video | ||
630 | */ | ||
631 | static int altivec_uyvy_rgb32 (SwsContext *c, | ||
632 | unsigned char **in, int *instrides, | ||
633 | int srcSliceY, int srcSliceH, | ||
634 | unsigned char **oplanes, int *outstrides) | ||
635 | { | ||
636 | int w = c->srcW; | ||
637 | int h = srcSliceH; | ||
638 | int i,j; | ||
639 | vector unsigned char uyvy; | ||
640 | vector signed short Y,U,V; | ||
641 | vector signed short R0,G0,B0,R1,G1,B1; | ||
642 | vector unsigned char R,G,B; | ||
643 | vector unsigned char *out; | ||
644 | ubyte *img; | ||
645 | |||
646 | img = in[0]; | ||
647 | out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]); | ||
648 | |||
649 | for (i=0;i<h;i++) { | ||
650 | for (j=0;j<w/16;j++) { | ||
651 | uyvy = vec_ld (0, img); | ||
652 | U = (vector signed short) | ||
653 | vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u); | ||
654 | |||
655 | V = (vector signed short) | ||
656 | vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v); | ||
657 | |||
658 | Y = (vector signed short) | ||
659 | vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y); | ||
660 | |||
661 | cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0); | ||
662 | |||
663 | uyvy = vec_ld (16, img); | ||
664 | U = (vector signed short) | ||
665 | vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u); | ||
666 | |||
667 | V = (vector signed short) | ||
668 | vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v); | ||
669 | |||
670 | Y = (vector signed short) | ||
671 | vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y); | ||
672 | |||
673 | cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1); | ||
674 | |||
675 | R = vec_packclp (R0,R1); | ||
676 | G = vec_packclp (G0,G1); | ||
677 | B = vec_packclp (B0,B1); | ||
678 | |||
679 | // vec_mstbgr24 (R,G,B, out); | ||
680 | out_rgba (R,G,B,out); | ||
681 | |||
682 | img += 32; | ||
683 | } | ||
684 | } | ||
685 | return srcSliceH; | ||
686 | } | ||
687 | |||
688 | |||
689 | |||
690 | /* Ok currently the acceleration routine only supports | ||
691 | inputs of widths a multiple of 16 | ||
692 | and heights a multiple 2 | ||
693 | |||
694 | So we just fall back to the C codes for this. | ||
695 | */ | ||
696 | SwsFunc yuv2rgb_init_altivec (SwsContext *c) | ||
697 | { | ||
698 | if (!(c->flags & SWS_CPU_CAPS_ALTIVEC)) | ||
699 | return NULL; | ||
700 | |||
701 | /* | ||
702 | and this seems not to matter too much I tried a bunch of | ||
703 | videos with abnormal widths and MPlayer crashes elsewhere. | ||
704 | mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv | ||
705 | boom with X11 bad match. | ||
706 | |||
707 | */ | ||
708 | if ((c->srcW & 0xf) != 0) return NULL; | ||
709 | |||
710 | switch (c->srcFormat) { | ||
711 | case PIX_FMT_YUV410P: | ||
712 | case PIX_FMT_YUV420P: | ||
713 | /*case IMGFMT_CLPL: ??? */ | ||
714 | case PIX_FMT_GRAY8: | ||
715 | case PIX_FMT_NV12: | ||
716 | case PIX_FMT_NV21: | ||
717 | if ((c->srcH & 0x1) != 0) | ||
718 | return NULL; | ||
719 | |||
720 | switch(c->dstFormat){ | ||
721 | case PIX_FMT_RGB24: | ||
722 | av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n"); | ||
723 | return altivec_yuv2_rgb24; | ||
724 | case PIX_FMT_BGR24: | ||
725 | av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n"); | ||
726 | return altivec_yuv2_bgr24; | ||
727 | case PIX_FMT_ARGB: | ||
728 | av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n"); | ||
729 | return altivec_yuv2_argb; | ||
730 | case PIX_FMT_ABGR: | ||
731 | av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n"); | ||
732 | return altivec_yuv2_abgr; | ||
733 | case PIX_FMT_RGBA: | ||
734 | av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n"); | ||
735 | return altivec_yuv2_rgba; | ||
736 | case PIX_FMT_BGRA: | ||
737 | av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n"); | ||
738 | return altivec_yuv2_bgra; | ||
739 | default: return NULL; | ||
740 | } | ||
741 | break; | ||
742 | |||
743 | case PIX_FMT_UYVY422: | ||
744 | switch(c->dstFormat){ | ||
745 | case PIX_FMT_BGR32: | ||
746 | av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n"); | ||
747 | return altivec_uyvy_rgb32; | ||
748 | default: return NULL; | ||
749 | } | ||
750 | break; | ||
751 | |||
752 | } | ||
753 | return NULL; | ||
754 | } | ||
755 | |||
756 | void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation) | ||
757 | { | ||
758 | union { | ||
759 | signed short tmp[8] __attribute__ ((aligned(16))); | ||
760 | vector signed short vec; | ||
761 | } buf; | ||
762 | |||
763 | buf.tmp[0] = ((0xffffLL) * contrast>>8)>>9; //cy | ||
764 | buf.tmp[1] = -256*brightness; //oy | ||
765 | buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv | ||
766 | buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu | ||
767 | buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu | ||
768 | buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv | ||
769 | |||
770 | |||
771 | c->CSHIFT = (vector unsigned short)vec_splat_u16(2); | ||
772 | c->CY = vec_splat ((vector signed short)buf.vec, 0); | ||
773 | c->OY = vec_splat ((vector signed short)buf.vec, 1); | ||
774 | c->CRV = vec_splat ((vector signed short)buf.vec, 2); | ||
775 | c->CBU = vec_splat ((vector signed short)buf.vec, 3); | ||
776 | c->CGU = vec_splat ((vector signed short)buf.vec, 4); | ||
777 | c->CGV = vec_splat ((vector signed short)buf.vec, 5); | ||
778 | #if 0 | ||
779 | { | ||
780 | int i; | ||
781 | char *v[6]={"cy","oy","crv","cbu","cgu","cgv"}; | ||
782 | for (i=0; i<6; i++) | ||
783 | printf("%s %d ", v[i],buf.tmp[i] ); | ||
784 | printf("\n"); | ||
785 | } | ||
786 | #endif | ||
787 | return; | ||
788 | } | ||
789 | |||
790 | |||
791 | void | ||
792 | altivec_yuv2packedX (SwsContext *c, | ||
793 | int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize, | ||
794 | int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, | ||
795 | uint8_t *dest, int dstW, int dstY) | ||
796 | { | ||
797 | int i,j; | ||
798 | vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V; | ||
799 | vector signed short R0,G0,B0,R1,G1,B1; | ||
800 | |||
801 | vector unsigned char R,G,B; | ||
802 | vector unsigned char *out,*nout; | ||
803 | |||
804 | vector signed short RND = vec_splat_s16(1<<3); | ||
805 | vector unsigned short SCL = vec_splat_u16(4); | ||
806 | unsigned long scratch[16] __attribute__ ((aligned (16))); | ||
807 | |||
808 | vector signed short *YCoeffs, *CCoeffs; | ||
809 | |||
810 | YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize; | ||
811 | CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize; | ||
812 | |||
813 | out = (vector unsigned char *)dest; | ||
814 | |||
815 | for (i=0; i<dstW; i+=16){ | ||
816 | Y0 = RND; | ||
817 | Y1 = RND; | ||
818 | /* extract 16 coeffs from lumSrc */ | ||
819 | for (j=0; j<lumFilterSize; j++) { | ||
820 | X0 = vec_ld (0, &lumSrc[j][i]); | ||
821 | X1 = vec_ld (16, &lumSrc[j][i]); | ||
822 | Y0 = vec_mradds (X0, YCoeffs[j], Y0); | ||
823 | Y1 = vec_mradds (X1, YCoeffs[j], Y1); | ||
824 | } | ||
825 | |||
826 | U = RND; | ||
827 | V = RND; | ||
828 | /* extract 8 coeffs from U,V */ | ||
829 | for (j=0; j<chrFilterSize; j++) { | ||
830 | X = vec_ld (0, &chrSrc[j][i/2]); | ||
831 | U = vec_mradds (X, CCoeffs[j], U); | ||
832 | X = vec_ld (0, &chrSrc[j][i/2+2048]); | ||
833 | V = vec_mradds (X, CCoeffs[j], V); | ||
834 | } | ||
835 | |||
836 | /* scale and clip signals */ | ||
837 | Y0 = vec_sra (Y0, SCL); | ||
838 | Y1 = vec_sra (Y1, SCL); | ||
839 | U = vec_sra (U, SCL); | ||
840 | V = vec_sra (V, SCL); | ||
841 | |||
842 | Y0 = vec_clip_s16 (Y0); | ||
843 | Y1 = vec_clip_s16 (Y1); | ||
844 | U = vec_clip_s16 (U); | ||
845 | V = vec_clip_s16 (V); | ||
846 | |||
847 | /* now we have | ||
848 | Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 | ||
849 | U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7 | ||
850 | |||
851 | Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 | ||
852 | U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7 | ||
853 | V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7 | ||
854 | */ | ||
855 | |||
856 | U0 = vec_mergeh (U,U); | ||
857 | V0 = vec_mergeh (V,V); | ||
858 | |||
859 | U1 = vec_mergel (U,U); | ||
860 | V1 = vec_mergel (V,V); | ||
861 | |||
862 | cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0); | ||
863 | cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1); | ||
864 | |||
865 | R = vec_packclp (R0,R1); | ||
866 | G = vec_packclp (G0,G1); | ||
867 | B = vec_packclp (B0,B1); | ||
868 | |||
869 | switch(c->dstFormat) { | ||
870 | case PIX_FMT_ABGR: out_abgr (R,G,B,out); break; | ||
871 | case PIX_FMT_BGRA: out_bgra (R,G,B,out); break; | ||
872 | case PIX_FMT_RGBA: out_rgba (R,G,B,out); break; | ||
873 | case PIX_FMT_ARGB: out_argb (R,G,B,out); break; | ||
874 | case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break; | ||
875 | case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break; | ||
876 | default: | ||
877 | { | ||
878 | /* If this is reached, the caller should have called yuv2packedXinC | ||
879 | instead. */ | ||
880 | static int printed_error_message; | ||
881 | if (!printed_error_message) { | ||
882 | av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n", | ||
883 | sws_format_name(c->dstFormat)); | ||
884 | printed_error_message=1; | ||
885 | } | ||
886 | return; | ||
887 | } | ||
888 | } | ||
889 | } | ||
890 | |||
891 | if (i < dstW) { | ||
892 | i -= 16; | ||
893 | |||
894 | Y0 = RND; | ||
895 | Y1 = RND; | ||
896 | /* extract 16 coeffs from lumSrc */ | ||
897 | for (j=0; j<lumFilterSize; j++) { | ||
898 | X0 = vec_ld (0, &lumSrc[j][i]); | ||
899 | X1 = vec_ld (16, &lumSrc[j][i]); | ||
900 | Y0 = vec_mradds (X0, YCoeffs[j], Y0); | ||
901 | Y1 = vec_mradds (X1, YCoeffs[j], Y1); | ||
902 | } | ||
903 | |||
904 | U = RND; | ||
905 | V = RND; | ||
906 | /* extract 8 coeffs from U,V */ | ||
907 | for (j=0; j<chrFilterSize; j++) { | ||
908 | X = vec_ld (0, &chrSrc[j][i/2]); | ||
909 | U = vec_mradds (X, CCoeffs[j], U); | ||
910 | X = vec_ld (0, &chrSrc[j][i/2+2048]); | ||
911 | V = vec_mradds (X, CCoeffs[j], V); | ||
912 | } | ||
913 | |||
914 | /* scale and clip signals */ | ||
915 | Y0 = vec_sra (Y0, SCL); | ||
916 | Y1 = vec_sra (Y1, SCL); | ||
917 | U = vec_sra (U, SCL); | ||
918 | V = vec_sra (V, SCL); | ||
919 | |||
920 | Y0 = vec_clip_s16 (Y0); | ||
921 | Y1 = vec_clip_s16 (Y1); | ||
922 | U = vec_clip_s16 (U); | ||
923 | V = vec_clip_s16 (V); | ||
924 | |||
925 | /* now we have | ||
926 | Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 | ||
927 | U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7 | ||
928 | |||
929 | Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15 | ||
930 | U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7 | ||
931 | V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7 | ||
932 | */ | ||
933 | |||
934 | U0 = vec_mergeh (U,U); | ||
935 | V0 = vec_mergeh (V,V); | ||
936 | |||
937 | U1 = vec_mergel (U,U); | ||
938 | V1 = vec_mergel (V,V); | ||
939 | |||
940 | cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0); | ||
941 | cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1); | ||
942 | |||
943 | R = vec_packclp (R0,R1); | ||
944 | G = vec_packclp (G0,G1); | ||
945 | B = vec_packclp (B0,B1); | ||
946 | |||
947 | nout = (vector unsigned char *)scratch; | ||
948 | switch(c->dstFormat) { | ||
949 | case PIX_FMT_ABGR: out_abgr (R,G,B,nout); break; | ||
950 | case PIX_FMT_BGRA: out_bgra (R,G,B,nout); break; | ||
951 | case PIX_FMT_RGBA: out_rgba (R,G,B,nout); break; | ||
952 | case PIX_FMT_ARGB: out_argb (R,G,B,nout); break; | ||
953 | case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break; | ||
954 | case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break; | ||
955 | default: | ||
956 | /* Unreachable, I think. */ | ||
957 | av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n", | ||
958 | sws_format_name(c->dstFormat)); | ||
959 | return; | ||
960 | } | ||
961 | |||
962 | memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4); | ||
963 | } | ||
964 | |||
965 | } | ||
diff --git a/src/plugins/ffmpeg/libswscale/yuv2rgb_bfin.c b/src/plugins/ffmpeg/libswscale/yuv2rgb_bfin.c deleted file mode 100644 index 1500a96..0000000 --- a/src/plugins/ffmpeg/libswscale/yuv2rgb_bfin.c +++ /dev/null | |||
@@ -1,206 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com> | ||
3 | * | ||
4 | * Blackfin video color space converter operations | ||
5 | * convert I420 YV12 to RGB in various formats | ||
6 | * | ||
7 | * This file is part of FFmpeg. | ||
8 | * | ||
9 | * FFmpeg is free software; you can redistribute it and/or | ||
10 | * modify it under the terms of the GNU Lesser General Public | ||
11 | * License as published by the Free Software Foundation; either | ||
12 | * version 2.1 of the License, or (at your option) any later version. | ||
13 | * | ||
14 | * FFmpeg is distributed in the hope that it will be useful, | ||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
17 | * Lesser General Public License for more details. | ||
18 | * | ||
19 | * You should have received a copy of the GNU Lesser General Public | ||
20 | * License along with FFmpeg; if not, write to the Free Software | ||
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
22 | */ | ||
23 | |||
24 | #include <stdio.h> | ||
25 | #include <stdlib.h> | ||
26 | #include <string.h> | ||
27 | #include <inttypes.h> | ||
28 | #include <assert.h> | ||
29 | #include "config.h" | ||
30 | #ifdef HAVE_MALLOC_H | ||
31 | #include <malloc.h> | ||
32 | #endif | ||
33 | #include <unistd.h> | ||
34 | #include "rgb2rgb.h" | ||
35 | #include "swscale.h" | ||
36 | #include "swscale_internal.h" | ||
37 | |||
38 | #ifdef __FDPIC__ | ||
39 | #define L1CODE __attribute__ ((l1_text)) | ||
40 | #else | ||
41 | #define L1CODE | ||
42 | #endif | ||
43 | |||
44 | extern void ff_bfin_yuv2rgb555_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out, | ||
45 | int w, uint32_t *coeffs) L1CODE; | ||
46 | |||
47 | extern void ff_bfin_yuv2rgb565_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out, | ||
48 | int w, uint32_t *coeffs) L1CODE; | ||
49 | |||
50 | extern void ff_bfin_yuv2rgb24_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out, | ||
51 | int w, uint32_t *coeffs) L1CODE; | ||
52 | |||
53 | typedef void (* ltransform_t)(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out, | ||
54 | int w, uint32_t *coeffs); | ||
55 | |||
56 | |||
57 | static void bfin_prepare_coefficients (SwsContext *c, int rgb, int masks) | ||
58 | { | ||
59 | int oy; | ||
60 | oy = c->yOffset&0xffff; | ||
61 | oy = oy >> 3; // keep everything U8.0 for offset calculation | ||
62 | |||
63 | c->oc = 128*0x01010101U; | ||
64 | c->oy = oy*0x01010101U; | ||
65 | |||
66 | /* copy 64bit vector coeffs down to 32bit vector coeffs */ | ||
67 | c->cy = c->yCoeff; | ||
68 | c->zero = 0; | ||
69 | |||
70 | if (rgb) { | ||
71 | c->crv = c->vrCoeff; | ||
72 | c->cbu = c->ubCoeff; | ||
73 | c->cgu = c->ugCoeff; | ||
74 | c->cgv = c->vgCoeff; | ||
75 | } else { | ||
76 | c->crv = c->ubCoeff; | ||
77 | c->cbu = c->vrCoeff; | ||
78 | c->cgu = c->vgCoeff; | ||
79 | c->cgv = c->ugCoeff; | ||
80 | } | ||
81 | |||
82 | |||
83 | if (masks == 555) { | ||
84 | c->rmask = 0x001f * 0x00010001U; | ||
85 | c->gmask = 0x03e0 * 0x00010001U; | ||
86 | c->bmask = 0x7c00 * 0x00010001U; | ||
87 | } else if (masks == 565) { | ||
88 | c->rmask = 0x001f * 0x00010001U; | ||
89 | c->gmask = 0x07e0 * 0x00010001U; | ||
90 | c->bmask = 0xf800 * 0x00010001U; | ||
91 | } | ||
92 | } | ||
93 | |||
94 | static int core_yuv420_rgb (SwsContext *c, | ||
95 | uint8_t **in, int *instrides, | ||
96 | int srcSliceY, int srcSliceH, | ||
97 | uint8_t **oplanes, int *outstrides, | ||
98 | ltransform_t lcscf, int rgb, int masks) | ||
99 | { | ||
100 | uint8_t *py,*pu,*pv,*op; | ||
101 | int w = instrides[0]; | ||
102 | int h2 = srcSliceH>>1; | ||
103 | int i; | ||
104 | |||
105 | bfin_prepare_coefficients (c, rgb, masks); | ||
106 | |||
107 | py = in[0]; | ||
108 | pu = in[1+(1^rgb)]; | ||
109 | pv = in[1+(0^rgb)]; | ||
110 | |||
111 | op = oplanes[0] + srcSliceY*outstrides[0]; | ||
112 | |||
113 | for (i=0;i<h2;i++) { | ||
114 | |||
115 | lcscf (py, pu, pv, op, w, &c->oy); | ||
116 | |||
117 | py += instrides[0]; | ||
118 | op += outstrides[0]; | ||
119 | |||
120 | lcscf (py, pu, pv, op, w, &c->oy); | ||
121 | |||
122 | py += instrides[0]; | ||
123 | pu += instrides[1]; | ||
124 | pv += instrides[2]; | ||
125 | op += outstrides[0]; | ||
126 | } | ||
127 | |||
128 | return srcSliceH; | ||
129 | } | ||
130 | |||
131 | |||
132 | static int bfin_yuv420_rgb555 (SwsContext *c, | ||
133 | uint8_t **in, int *instrides, | ||
134 | int srcSliceY, int srcSliceH, | ||
135 | uint8_t **oplanes, int *outstrides) | ||
136 | { | ||
137 | return core_yuv420_rgb (c, in, instrides, srcSliceY, srcSliceH, oplanes, outstrides, | ||
138 | ff_bfin_yuv2rgb555_line, 1, 555); | ||
139 | } | ||
140 | |||
141 | static int bfin_yuv420_bgr555 (SwsContext *c, | ||
142 | uint8_t **in, int *instrides, | ||
143 | int srcSliceY, int srcSliceH, | ||
144 | uint8_t **oplanes, int *outstrides) | ||
145 | { | ||
146 | return core_yuv420_rgb (c, in, instrides, srcSliceY, srcSliceH, oplanes, outstrides, | ||
147 | ff_bfin_yuv2rgb555_line, 0, 555); | ||
148 | } | ||
149 | |||
150 | static int bfin_yuv420_rgb24 (SwsContext *c, | ||
151 | uint8_t **in, int *instrides, | ||
152 | int srcSliceY, int srcSliceH, | ||
153 | uint8_t **oplanes, int *outstrides) | ||
154 | { | ||
155 | return core_yuv420_rgb (c, in, instrides, srcSliceY, srcSliceH, oplanes, outstrides, | ||
156 | ff_bfin_yuv2rgb24_line, 1, 888); | ||
157 | } | ||
158 | |||
159 | static int bfin_yuv420_bgr24 (SwsContext *c, | ||
160 | uint8_t **in, int *instrides, | ||
161 | int srcSliceY, int srcSliceH, | ||
162 | uint8_t **oplanes, int *outstrides) | ||
163 | { | ||
164 | return core_yuv420_rgb (c, in, instrides, srcSliceY, srcSliceH, oplanes, outstrides, | ||
165 | ff_bfin_yuv2rgb24_line, 0, 888); | ||
166 | } | ||
167 | |||
168 | static int bfin_yuv420_rgb565 (SwsContext *c, | ||
169 | uint8_t **in, int *instrides, | ||
170 | int srcSliceY, int srcSliceH, | ||
171 | uint8_t **oplanes, int *outstrides) | ||
172 | { | ||
173 | return core_yuv420_rgb (c, in, instrides, srcSliceY, srcSliceH, oplanes, outstrides, | ||
174 | ff_bfin_yuv2rgb565_line, 1, 565); | ||
175 | } | ||
176 | |||
177 | static int bfin_yuv420_bgr565 (SwsContext *c, | ||
178 | uint8_t **in, int *instrides, | ||
179 | int srcSliceY, int srcSliceH, | ||
180 | uint8_t **oplanes, int *outstrides) | ||
181 | { | ||
182 | return core_yuv420_rgb (c, in, instrides, srcSliceY, srcSliceH, oplanes, outstrides, | ||
183 | ff_bfin_yuv2rgb565_line, 0, 565); | ||
184 | } | ||
185 | |||
186 | |||
187 | SwsFunc ff_bfin_yuv2rgb_get_func_ptr (SwsContext *c) | ||
188 | { | ||
189 | SwsFunc f; | ||
190 | |||
191 | switch(c->dstFormat) { | ||
192 | case PIX_FMT_RGB555: f = bfin_yuv420_rgb555; break; | ||
193 | case PIX_FMT_BGR555: f = bfin_yuv420_bgr555; break; | ||
194 | case PIX_FMT_RGB565: f = bfin_yuv420_rgb565; break; | ||
195 | case PIX_FMT_BGR565: f = bfin_yuv420_bgr565; break; | ||
196 | case PIX_FMT_RGB24: f = bfin_yuv420_rgb24; break; | ||
197 | case PIX_FMT_BGR24: f = bfin_yuv420_bgr24; break; | ||
198 | default: | ||
199 | return 0; | ||
200 | } | ||
201 | |||
202 | av_log(c, AV_LOG_INFO, "BlackFin accelerated color space converter %s\n", | ||
203 | sws_format_name (c->dstFormat)); | ||
204 | |||
205 | return f; | ||
206 | } | ||
diff --git a/src/plugins/ffmpeg/libswscale/yuv2rgb_mlib.c b/src/plugins/ffmpeg/libswscale/yuv2rgb_mlib.c deleted file mode 100644 index ff2e50a..0000000 --- a/src/plugins/ffmpeg/libswscale/yuv2rgb_mlib.c +++ /dev/null | |||
@@ -1,85 +0,0 @@ | |||
1 | /* | ||
2 | * software YUV to RGB converter using mediaLib | ||
3 | * | ||
4 | * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at> | ||
5 | * | ||
6 | * This file is part of FFmpeg. | ||
7 | * | ||
8 | * FFmpeg is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU Lesser General Public | ||
10 | * License as published by the Free Software Foundation; either | ||
11 | * version 2.1 of the License, or (at your option) any later version. | ||
12 | * | ||
13 | * FFmpeg is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
16 | * Lesser General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU Lesser General Public | ||
19 | * License along with FFmpeg; if not, write to the Free Software | ||
20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
21 | */ | ||
22 | |||
23 | #include <mlib_types.h> | ||
24 | #include <mlib_status.h> | ||
25 | #include <mlib_sys.h> | ||
26 | #include <mlib_video.h> | ||
27 | #include <inttypes.h> | ||
28 | #include <stdlib.h> | ||
29 | #include <assert.h> | ||
30 | |||
31 | #include "swscale.h" | ||
32 | |||
33 | static int mlib_YUV2ARGB420_32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | ||
34 | int srcSliceH, uint8_t* dst[], int dstStride[]){ | ||
35 | if(c->srcFormat == PIX_FMT_YUV422P){ | ||
36 | srcStride[1] *= 2; | ||
37 | srcStride[2] *= 2; | ||
38 | } | ||
39 | |||
40 | assert(srcStride[1] == srcStride[2]); | ||
41 | |||
42 | mlib_VideoColorYUV2ARGB420(dst[0]+srcSliceY*dstStride[0], src[0], src[1], src[2], c->dstW, | ||
43 | srcSliceH, dstStride[0], srcStride[0], srcStride[1]); | ||
44 | return srcSliceH; | ||
45 | } | ||
46 | |||
47 | static int mlib_YUV2ABGR420_32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | ||
48 | int srcSliceH, uint8_t* dst[], int dstStride[]){ | ||
49 | if(c->srcFormat == PIX_FMT_YUV422P){ | ||
50 | srcStride[1] *= 2; | ||
51 | srcStride[2] *= 2; | ||
52 | } | ||
53 | |||
54 | assert(srcStride[1] == srcStride[2]); | ||
55 | |||
56 | mlib_VideoColorYUV2ABGR420(dst[0]+srcSliceY*dstStride[0], src[0], src[1], src[2], c->dstW, | ||
57 | srcSliceH, dstStride[0], srcStride[0], srcStride[1]); | ||
58 | return srcSliceH; | ||
59 | } | ||
60 | |||
61 | static int mlib_YUV2RGB420_24(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | ||
62 | int srcSliceH, uint8_t* dst[], int dstStride[]){ | ||
63 | if(c->srcFormat == PIX_FMT_YUV422P){ | ||
64 | srcStride[1] *= 2; | ||
65 | srcStride[2] *= 2; | ||
66 | } | ||
67 | |||
68 | assert(srcStride[1] == srcStride[2]); | ||
69 | |||
70 | mlib_VideoColorYUV2RGB420(dst[0]+srcSliceY*dstStride[0], src[0], src[1], src[2], c->dstW, | ||
71 | srcSliceH, dstStride[0], srcStride[0], srcStride[1]); | ||
72 | return srcSliceH; | ||
73 | } | ||
74 | |||
75 | |||
76 | SwsFunc yuv2rgb_init_mlib(SwsContext *c) | ||
77 | { | ||
78 | switch(c->dstFormat){ | ||
79 | case PIX_FMT_RGB24: return mlib_YUV2RGB420_24; | ||
80 | case PIX_FMT_BGR32: return mlib_YUV2ARGB420_32; | ||
81 | case PIX_FMT_RGB32: return mlib_YUV2ABGR420_32; | ||
82 | default: return NULL; | ||
83 | } | ||
84 | } | ||
85 | |||
diff --git a/src/plugins/ffmpeg/libswscale/yuv2rgb_template.c b/src/plugins/ffmpeg/libswscale/yuv2rgb_template.c deleted file mode 100644 index 1f8e225..0000000 --- a/src/plugins/ffmpeg/libswscale/yuv2rgb_template.c +++ /dev/null | |||
@@ -1,538 +0,0 @@ | |||
1 | /* | ||
2 | * yuv2rgb_mmx.c, software YUV to RGB converter with Intel MMX "technology" | ||
3 | * | ||
4 | * Copyright (C) 2000, Silicon Integrated System Corp. | ||
5 | * | ||
6 | * Author: Olie Lho <ollie@sis.com.tw> | ||
7 | * | ||
8 | * 15,24 bpp and dithering from Michael Niedermayer (michaelni@gmx.at) | ||
9 | * MMX/MMX2 Template stuff from Michael Niedermayer (needed for fast movntq support) | ||
10 | * context / deglobalize stuff by Michael Niedermayer | ||
11 | * | ||
12 | * This file is part of mpeg2dec, a free MPEG-2 video decoder | ||
13 | * | ||
14 | * mpeg2dec is free software; you can redistribute it and/or modify | ||
15 | * it under the terms of the GNU General Public License as published by | ||
16 | * the Free Software Foundation; either version 2, or (at your option) | ||
17 | * any later version. | ||
18 | * | ||
19 | * mpeg2dec is distributed in the hope that it will be useful, | ||
20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
22 | * GNU General Public License for more details. | ||
23 | * | ||
24 | * You should have received a copy of the GNU General Public License | ||
25 | * along with mpeg2dec; if not, write to the Free Software | ||
26 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
27 | */ | ||
28 | |||
29 | #undef MOVNTQ | ||
30 | #undef EMMS | ||
31 | #undef SFENCE | ||
32 | |||
33 | #ifdef HAVE_3DNOW | ||
34 | /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */ | ||
35 | #define EMMS "femms" | ||
36 | #else | ||
37 | #define EMMS "emms" | ||
38 | #endif | ||
39 | |||
40 | #ifdef HAVE_MMX2 | ||
41 | #define MOVNTQ "movntq" | ||
42 | #define SFENCE "sfence" | ||
43 | #else | ||
44 | #define MOVNTQ "movq" | ||
45 | #define SFENCE "/nop" | ||
46 | #endif | ||
47 | |||
48 | #define YUV2RGB \ | ||
49 | /* Do the multiply part of the conversion for even and odd pixels, | ||
50 | register usage: | ||
51 | mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels, | ||
52 | mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels, | ||
53 | mm6 -> Y even, mm7 -> Y odd */\ | ||
54 | /* convert the chroma part */\ | ||
55 | "punpcklbw %%mm4, %%mm0;" /* scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \ | ||
56 | "punpcklbw %%mm4, %%mm1;" /* scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \ | ||
57 | \ | ||
58 | "psllw $3, %%mm0;" /* Promote precision */ \ | ||
59 | "psllw $3, %%mm1;" /* Promote precision */ \ | ||
60 | \ | ||
61 | "psubsw "U_OFFSET"(%4), %%mm0;" /* Cb -= 128 */ \ | ||
62 | "psubsw "V_OFFSET"(%4), %%mm1;" /* Cr -= 128 */ \ | ||
63 | \ | ||
64 | "movq %%mm0, %%mm2;" /* Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \ | ||
65 | "movq %%mm1, %%mm3;" /* Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \ | ||
66 | \ | ||
67 | "pmulhw "UG_COEFF"(%4), %%mm2;" /* Mul Cb with green coeff -> Cb green */ \ | ||
68 | "pmulhw "VG_COEFF"(%4), %%mm3;" /* Mul Cr with green coeff -> Cr green */ \ | ||
69 | \ | ||
70 | "pmulhw "UB_COEFF"(%4), %%mm0;" /* Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 */\ | ||
71 | "pmulhw "VR_COEFF"(%4), %%mm1;" /* Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 */\ | ||
72 | \ | ||
73 | "paddsw %%mm3, %%mm2;" /* Cb green + Cr green -> Cgreen */\ | ||
74 | \ | ||
75 | /* convert the luma part */\ | ||
76 | "movq %%mm6, %%mm7;" /* Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */\ | ||
77 | "pand "MANGLE(mmx_00ffw)", %%mm6;" /* get Y even 00 Y6 00 Y4 00 Y2 00 Y0 */\ | ||
78 | \ | ||
79 | "psrlw $8, %%mm7;" /* get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 */\ | ||
80 | \ | ||
81 | "psllw $3, %%mm6;" /* Promote precision */\ | ||
82 | "psllw $3, %%mm7;" /* Promote precision */\ | ||
83 | \ | ||
84 | "psubw "Y_OFFSET"(%4), %%mm6;" /* Y -= 16 */\ | ||
85 | "psubw "Y_OFFSET"(%4), %%mm7;" /* Y -= 16 */\ | ||
86 | \ | ||
87 | "pmulhw "Y_COEFF"(%4), %%mm6;" /* Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 */\ | ||
88 | "pmulhw "Y_COEFF"(%4), %%mm7;" /* Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 */\ | ||
89 | \ | ||
90 | /* Do the addition part of the conversion for even and odd pixels, | ||
91 | register usage: | ||
92 | mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels, | ||
93 | mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels, | ||
94 | mm6 -> Y even, mm7 -> Y odd */\ | ||
95 | "movq %%mm0, %%mm3;" /* Copy Cblue */\ | ||
96 | "movq %%mm1, %%mm4;" /* Copy Cred */\ | ||
97 | "movq %%mm2, %%mm5;" /* Copy Cgreen */\ | ||
98 | \ | ||
99 | "paddsw %%mm6, %%mm0;" /* Y even + Cblue 00 B6 00 B4 00 B2 00 B0 */\ | ||
100 | "paddsw %%mm7, %%mm3;" /* Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 */\ | ||
101 | \ | ||
102 | "paddsw %%mm6, %%mm1;" /* Y even + Cred 00 R6 00 R4 00 R2 00 R0 */\ | ||
103 | "paddsw %%mm7, %%mm4;" /* Y odd + Cred 00 R7 00 R5 00 R3 00 R1 */\ | ||
104 | \ | ||
105 | "paddsw %%mm6, %%mm2;" /* Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 */\ | ||
106 | "paddsw %%mm7, %%mm5;" /* Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 */\ | ||
107 | \ | ||
108 | /* Limit RGB even to 0..255 */\ | ||
109 | "packuswb %%mm0, %%mm0;" /* B6 B4 B2 B0 B6 B4 B2 B0 */\ | ||
110 | "packuswb %%mm1, %%mm1;" /* R6 R4 R2 R0 R6 R4 R2 R0 */\ | ||
111 | "packuswb %%mm2, %%mm2;" /* G6 G4 G2 G0 G6 G4 G2 G0 */\ | ||
112 | \ | ||
113 | /* Limit RGB odd to 0..255 */\ | ||
114 | "packuswb %%mm3, %%mm3;" /* B7 B5 B3 B1 B7 B5 B3 B1 */\ | ||
115 | "packuswb %%mm4, %%mm4;" /* R7 R5 R3 R1 R7 R5 R3 R1 */\ | ||
116 | "packuswb %%mm5, %%mm5;" /* G7 G5 G3 G1 G7 G5 G3 G1 */\ | ||
117 | \ | ||
118 | /* Interleave RGB even and odd */\ | ||
119 | "punpcklbw %%mm3, %%mm0;" /* B7 B6 B5 B4 B3 B2 B1 B0 */\ | ||
120 | "punpcklbw %%mm4, %%mm1;" /* R7 R6 R5 R4 R3 R2 R1 R0 */\ | ||
121 | "punpcklbw %%mm5, %%mm2;" /* G7 G6 G5 G4 G3 G2 G1 G0 */\ | ||
122 | |||
123 | |||
124 | static inline int RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | ||
125 | int srcSliceH, uint8_t* dst[], int dstStride[]){ | ||
126 | int y, h_size; | ||
127 | |||
128 | if(c->srcFormat == PIX_FMT_YUV422P){ | ||
129 | srcStride[1] *= 2; | ||
130 | srcStride[2] *= 2; | ||
131 | } | ||
132 | |||
133 | h_size= (c->dstW+7)&~7; | ||
134 | if(h_size*2 > FFABS(dstStride[0])) h_size-=8; | ||
135 | |||
136 | asm volatile ("pxor %mm4, %mm4;" /* zero mm4 */ ); | ||
137 | //printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0], | ||
138 | //srcStride[0],srcStride[1],srcStride[2],dstStride[0]); | ||
139 | for (y= 0; y<srcSliceH; y++ ) { | ||
140 | uint8_t *image = dst[0] + (y+srcSliceY)*dstStride[0]; | ||
141 | uint8_t *py = src[0] + y*srcStride[0]; | ||
142 | uint8_t *pu = src[1] + (y>>1)*srcStride[1]; | ||
143 | uint8_t *pv = src[2] + (y>>1)*srcStride[2]; | ||
144 | long index= -h_size/2; | ||
145 | |||
146 | b5Dither= ff_dither8[y&1]; | ||
147 | g6Dither= ff_dither4[y&1]; | ||
148 | g5Dither= ff_dither8[y&1]; | ||
149 | r5Dither= ff_dither8[(y+1)&1]; | ||
150 | /* This MMX assembly code deals with a SINGLE scan line at a time, | ||
151 | * it converts 8 pixels in each iteration. */ | ||
152 | asm volatile ( | ||
153 | /* load data for start of next scan line */ | ||
154 | "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ | ||
155 | "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ | ||
156 | "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ | ||
157 | //".balign 16 \n\t" | ||
158 | "1: \n\t" | ||
159 | /* No speed difference on my p3@500 with prefetch, | ||
160 | * if it is faster for anyone with -benchmark then tell me. | ||
161 | PREFETCH" 64(%0) \n\t" | ||
162 | PREFETCH" 64(%1) \n\t" | ||
163 | PREFETCH" 64(%2) \n\t" | ||
164 | */ | ||
165 | YUV2RGB | ||
166 | |||
167 | #ifdef DITHER1XBPP | ||
168 | "paddusb "MANGLE(b5Dither)", %%mm0;" | ||
169 | "paddusb "MANGLE(g6Dither)", %%mm2;" | ||
170 | "paddusb "MANGLE(r5Dither)", %%mm1;" | ||
171 | #endif | ||
172 | /* mask unneeded bits off */ | ||
173 | "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */ | ||
174 | "pand "MANGLE(mmx_grnmask)", %%mm2;" /* g7g6g5g4 g3g2_0_0 g7g6g5g4 g3g2_0_0 */ | ||
175 | "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */ | ||
176 | |||
177 | "psrlw $3, %%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */ | ||
178 | "pxor %%mm4, %%mm4;" /* zero mm4 */ | ||
179 | |||
180 | "movq %%mm0, %%mm5;" /* Copy B7-B0 */ | ||
181 | "movq %%mm2, %%mm7;" /* Copy G7-G0 */ | ||
182 | |||
183 | /* convert RGB24 plane to RGB16 pack for pixel 0-3 */ | ||
184 | "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */ | ||
185 | "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ | ||
186 | |||
187 | "psllw $3, %%mm2;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */ | ||
188 | "por %%mm2, %%mm0;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */ | ||
189 | |||
190 | "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ | ||
191 | MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */ | ||
192 | |||
193 | /* convert RGB24 plane to RGB16 pack for pixel 0-3 */ | ||
194 | "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */ | ||
195 | "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ | ||
196 | |||
197 | "psllw $3, %%mm7;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */ | ||
198 | "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ | ||
199 | |||
200 | "por %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */ | ||
201 | "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ | ||
202 | |||
203 | MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */ | ||
204 | |||
205 | "add $16, %1 \n\t" | ||
206 | "add $4, %0 \n\t" | ||
207 | " js 1b \n\t" | ||
208 | |||
209 | : "+r" (index), "+r" (image) | ||
210 | : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), "r" (py - 2*index) | ||
211 | ); | ||
212 | } | ||
213 | |||
214 | asm volatile (EMMS); | ||
215 | |||
216 | return srcSliceH; | ||
217 | } | ||
218 | |||
219 | static inline int RENAME(yuv420_rgb15)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | ||
220 | int srcSliceH, uint8_t* dst[], int dstStride[]){ | ||
221 | int y, h_size; | ||
222 | |||
223 | if(c->srcFormat == PIX_FMT_YUV422P){ | ||
224 | srcStride[1] *= 2; | ||
225 | srcStride[2] *= 2; | ||
226 | } | ||
227 | |||
228 | h_size= (c->dstW+7)&~7; | ||
229 | if(h_size*2 > FFABS(dstStride[0])) h_size-=8; | ||
230 | |||
231 | asm volatile ("pxor %mm4, %mm4;" /* zero mm4 */ ); | ||
232 | //printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0], | ||
233 | //srcStride[0],srcStride[1],srcStride[2],dstStride[0]); | ||
234 | for (y= 0; y<srcSliceH; y++ ) { | ||
235 | uint8_t *image = dst[0] + (y+srcSliceY)*dstStride[0]; | ||
236 | uint8_t *py = src[0] + y*srcStride[0]; | ||
237 | uint8_t *pu = src[1] + (y>>1)*srcStride[1]; | ||
238 | uint8_t *pv = src[2] + (y>>1)*srcStride[2]; | ||
239 | long index= -h_size/2; | ||
240 | |||
241 | b5Dither= ff_dither8[y&1]; | ||
242 | g6Dither= ff_dither4[y&1]; | ||
243 | g5Dither= ff_dither8[y&1]; | ||
244 | r5Dither= ff_dither8[(y+1)&1]; | ||
245 | /* This MMX assembly code deals with a SINGLE scan line at a time, | ||
246 | * it converts 8 pixels in each iteration. */ | ||
247 | asm volatile ( | ||
248 | /* load data for start of next scan line */ | ||
249 | "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ | ||
250 | "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ | ||
251 | "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ | ||
252 | //".balign 16 \n\t" | ||
253 | "1: \n\t" | ||
254 | YUV2RGB | ||
255 | |||
256 | #ifdef DITHER1XBPP | ||
257 | "paddusb "MANGLE(b5Dither)", %%mm0 \n\t" | ||
258 | "paddusb "MANGLE(g5Dither)", %%mm2 \n\t" | ||
259 | "paddusb "MANGLE(r5Dither)", %%mm1 \n\t" | ||
260 | #endif | ||
261 | |||
262 | /* mask unneeded bits off */ | ||
263 | "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */ | ||
264 | "pand "MANGLE(mmx_redmask)", %%mm2;" /* g7g6g5g4 g3_0_0_0 g7g6g5g4 g3_0_0_0 */ | ||
265 | "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */ | ||
266 | |||
267 | "psrlw $3, %%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */ | ||
268 | "psrlw $1, %%mm1;" /* 0_r7r6r5 r4r3_0_0 0_r7r6r5 r4r3_0_0 */ | ||
269 | "pxor %%mm4, %%mm4;" /* zero mm4 */ | ||
270 | |||
271 | "movq %%mm0, %%mm5;" /* Copy B7-B0 */ | ||
272 | "movq %%mm2, %%mm7;" /* Copy G7-G0 */ | ||
273 | |||
274 | /* convert RGB24 plane to RGB16 pack for pixel 0-3 */ | ||
275 | "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */ | ||
276 | "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ | ||
277 | |||
278 | "psllw $2, %%mm2;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */ | ||
279 | "por %%mm2, %%mm0;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */ | ||
280 | |||
281 | "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ | ||
282 | MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */ | ||
283 | |||
284 | /* convert RGB24 plane to RGB16 pack for pixel 0-3 */ | ||
285 | "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 0_g7g6g5 g4g3_0_0 */ | ||
286 | "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ | ||
287 | |||
288 | "psllw $2, %%mm7;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */ | ||
289 | "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ | ||
290 | |||
291 | "por %%mm7, %%mm5;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */ | ||
292 | "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ | ||
293 | |||
294 | MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */ | ||
295 | |||
296 | "add $16, %1 \n\t" | ||
297 | "add $4, %0 \n\t" | ||
298 | " js 1b \n\t" | ||
299 | : "+r" (index), "+r" (image) | ||
300 | : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), "r" (py - 2*index) | ||
301 | ); | ||
302 | } | ||
303 | |||
304 | asm volatile (EMMS); | ||
305 | return srcSliceH; | ||
306 | } | ||
307 | |||
308 | static inline int RENAME(yuv420_rgb24)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | ||
309 | int srcSliceH, uint8_t* dst[], int dstStride[]){ | ||
310 | int y, h_size; | ||
311 | |||
312 | if(c->srcFormat == PIX_FMT_YUV422P){ | ||
313 | srcStride[1] *= 2; | ||
314 | srcStride[2] *= 2; | ||
315 | } | ||
316 | |||
317 | h_size= (c->dstW+7)&~7; | ||
318 | if(h_size*3 > FFABS(dstStride[0])) h_size-=8; | ||
319 | |||
320 | asm volatile ("pxor %mm4, %mm4;" /* zero mm4 */ ); | ||
321 | |||
322 | for (y= 0; y<srcSliceH; y++ ) { | ||
323 | uint8_t *image = dst[0] + (y+srcSliceY)*dstStride[0]; | ||
324 | uint8_t *py = src[0] + y*srcStride[0]; | ||
325 | uint8_t *pu = src[1] + (y>>1)*srcStride[1]; | ||
326 | uint8_t *pv = src[2] + (y>>1)*srcStride[2]; | ||
327 | long index= -h_size/2; | ||
328 | |||
329 | /* This MMX assembly code deals with a SINGLE scan line at a time, | ||
330 | * it converts 8 pixels in each iteration. */ | ||
331 | asm volatile ( | ||
332 | /* load data for start of next scan line */ | ||
333 | "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ | ||
334 | "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ | ||
335 | "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ | ||
336 | //".balign 16 \n\t" | ||
337 | "1: \n\t" | ||
338 | YUV2RGB | ||
339 | /* mm0=B, %%mm2=G, %%mm1=R */ | ||
340 | #ifdef HAVE_MMX2 | ||
341 | "movq "MANGLE(ff_M24A)", %%mm4 \n\t" | ||
342 | "movq "MANGLE(ff_M24C)", %%mm7 \n\t" | ||
343 | "pshufw $0x50, %%mm0, %%mm5 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */ | ||
344 | "pshufw $0x50, %%mm2, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */ | ||
345 | "pshufw $0x00, %%mm1, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */ | ||
346 | |||
347 | "pand %%mm4, %%mm5 \n\t" /* B2 B1 B0 */ | ||
348 | "pand %%mm4, %%mm3 \n\t" /* G2 G1 G0 */ | ||
349 | "pand %%mm7, %%mm6 \n\t" /* R1 R0 */ | ||
350 | |||
351 | "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */ | ||
352 | "por %%mm5, %%mm6 \n\t" | ||
353 | "por %%mm3, %%mm6 \n\t" | ||
354 | MOVNTQ" %%mm6, (%1) \n\t" | ||
355 | |||
356 | "psrlq $8, %%mm2 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */ | ||
357 | "pshufw $0xA5, %%mm0, %%mm5 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */ | ||
358 | "pshufw $0x55, %%mm2, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */ | ||
359 | "pshufw $0xA5, %%mm1, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */ | ||
360 | |||
361 | "pand "MANGLE(ff_M24B)", %%mm5 \n\t" /* B5 B4 B3 */ | ||
362 | "pand %%mm7, %%mm3 \n\t" /* G4 G3 */ | ||
363 | "pand %%mm4, %%mm6 \n\t" /* R4 R3 R2 */ | ||
364 | |||
365 | "por %%mm5, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */ | ||
366 | "por %%mm3, %%mm6 \n\t" | ||
367 | MOVNTQ" %%mm6, 8(%1) \n\t" | ||
368 | |||
369 | "pshufw $0xFF, %%mm0, %%mm5 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */ | ||
370 | "pshufw $0xFA, %%mm2, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */ | ||
371 | "pshufw $0xFA, %%mm1, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */ | ||
372 | "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ | ||
373 | |||
374 | "pand %%mm7, %%mm5 \n\t" /* B7 B6 */ | ||
375 | "pand %%mm4, %%mm3 \n\t" /* G7 G6 G5 */ | ||
376 | "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */ | ||
377 | "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ | ||
378 | \ | ||
379 | "por %%mm5, %%mm3 \n\t" | ||
380 | "por %%mm3, %%mm6 \n\t" | ||
381 | MOVNTQ" %%mm6, 16(%1) \n\t" | ||
382 | "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ | ||
383 | "pxor %%mm4, %%mm4 \n\t" | ||
384 | |||
385 | #else | ||
386 | |||
387 | "pxor %%mm4, %%mm4 \n\t" | ||
388 | "movq %%mm0, %%mm5 \n\t" /* B */ | ||
389 | "movq %%mm1, %%mm6 \n\t" /* R */ | ||
390 | "punpcklbw %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */ | ||
391 | "punpcklbw %%mm4, %%mm1 \n\t" /* 0R0R0R0R 0 */ | ||
392 | "punpckhbw %%mm2, %%mm5 \n\t" /* GBGBGBGB 2 */ | ||
393 | "punpckhbw %%mm4, %%mm6 \n\t" /* 0R0R0R0R 2 */ | ||
394 | "movq %%mm0, %%mm7 \n\t" /* GBGBGBGB 0 */ | ||
395 | "movq %%mm5, %%mm3 \n\t" /* GBGBGBGB 2 */ | ||
396 | "punpcklwd %%mm1, %%mm7 \n\t" /* 0RGB0RGB 0 */ | ||
397 | "punpckhwd %%mm1, %%mm0 \n\t" /* 0RGB0RGB 1 */ | ||
398 | "punpcklwd %%mm6, %%mm5 \n\t" /* 0RGB0RGB 2 */ | ||
399 | "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */ | ||
400 | |||
401 | "movq %%mm7, %%mm2 \n\t" /* 0RGB0RGB 0 */ | ||
402 | "movq %%mm0, %%mm6 \n\t" /* 0RGB0RGB 1 */ | ||
403 | "movq %%mm5, %%mm1 \n\t" /* 0RGB0RGB 2 */ | ||
404 | "movq %%mm3, %%mm4 \n\t" /* 0RGB0RGB 3 */ | ||
405 | |||
406 | "psllq $40, %%mm7 \n\t" /* RGB00000 0 */ | ||
407 | "psllq $40, %%mm0 \n\t" /* RGB00000 1 */ | ||
408 | "psllq $40, %%mm5 \n\t" /* RGB00000 2 */ | ||
409 | "psllq $40, %%mm3 \n\t" /* RGB00000 3 */ | ||
410 | |||
411 | "punpckhdq %%mm2, %%mm7 \n\t" /* 0RGBRGB0 0 */ | ||
412 | "punpckhdq %%mm6, %%mm0 \n\t" /* 0RGBRGB0 1 */ | ||
413 | "punpckhdq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */ | ||
414 | "punpckhdq %%mm4, %%mm3 \n\t" /* 0RGBRGB0 3 */ | ||
415 | |||
416 | "psrlq $8, %%mm7 \n\t" /* 00RGBRGB 0 */ | ||
417 | "movq %%mm0, %%mm6 \n\t" /* 0RGBRGB0 1 */ | ||
418 | "psllq $40, %%mm0 \n\t" /* GB000000 1 */ | ||
419 | "por %%mm0, %%mm7 \n\t" /* GBRGBRGB 0 */ | ||
420 | MOVNTQ" %%mm7, (%1) \n\t" | ||
421 | |||
422 | "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ | ||
423 | |||
424 | "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */ | ||
425 | "movq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */ | ||
426 | "psllq $24, %%mm5 \n\t" /* BRGB0000 2 */ | ||
427 | "por %%mm5, %%mm6 \n\t" /* BRGBRGBR 1 */ | ||
428 | MOVNTQ" %%mm6, 8(%1) \n\t" | ||
429 | |||
430 | "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ | ||
431 | |||
432 | "psrlq $40, %%mm1 \n\t" /* 000000RG 2 */ | ||
433 | "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */ | ||
434 | "por %%mm3, %%mm1 \n\t" /* RGBRGBRG 2 */ | ||
435 | MOVNTQ" %%mm1, 16(%1) \n\t" | ||
436 | |||
437 | "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ | ||
438 | "pxor %%mm4, %%mm4 \n\t" | ||
439 | #endif | ||
440 | |||
441 | "add $24, %1 \n\t" | ||
442 | "add $4, %0 \n\t" | ||
443 | " js 1b \n\t" | ||
444 | |||
445 | : "+r" (index), "+r" (image) | ||
446 | : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), "r" (py - 2*index) | ||
447 | ); | ||
448 | } | ||
449 | |||
450 | asm volatile (EMMS); | ||
451 | return srcSliceH; | ||
452 | } | ||
453 | |||
454 | static inline int RENAME(yuv420_rgb32)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | ||
455 | int srcSliceH, uint8_t* dst[], int dstStride[]){ | ||
456 | int y, h_size; | ||
457 | |||
458 | if(c->srcFormat == PIX_FMT_YUV422P){ | ||
459 | srcStride[1] *= 2; | ||
460 | srcStride[2] *= 2; | ||
461 | } | ||
462 | |||
463 | h_size= (c->dstW+7)&~7; | ||
464 | if(h_size*4 > FFABS(dstStride[0])) h_size-=8; | ||
465 | |||
466 | asm volatile ("pxor %mm4, %mm4;" /* zero mm4 */ ); | ||
467 | |||
468 | for (y= 0; y<srcSliceH; y++ ) { | ||
469 | uint8_t *image = dst[0] + (y+srcSliceY)*dstStride[0]; | ||
470 | uint8_t *py = src[0] + y*srcStride[0]; | ||
471 | uint8_t *pu = src[1] + (y>>1)*srcStride[1]; | ||
472 | uint8_t *pv = src[2] + (y>>1)*srcStride[2]; | ||
473 | long index= -h_size/2; | ||
474 | |||
475 | /* This MMX assembly code deals with a SINGLE scan line at a time, | ||
476 | * it converts 8 pixels in each iteration. */ | ||
477 | asm volatile ( | ||
478 | /* load data for start of next scan line */ | ||
479 | "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ | ||
480 | "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ | ||
481 | "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ | ||
482 | //".balign 16 \n\t" | ||
483 | "1: \n\t" | ||
484 | YUV2RGB | ||
485 | /* convert RGB plane to RGB packed format, | ||
486 | mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0, | ||
487 | mm4 -> GB, mm5 -> AR pixel 4-7, | ||
488 | mm6 -> GB, mm7 -> AR pixel 0-3 */ | ||
489 | "pxor %%mm3, %%mm3;" /* zero mm3 */ | ||
490 | |||
491 | "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ | ||
492 | "movq %%mm1, %%mm7;" /* R7 R6 R5 R4 R3 R2 R1 R0 */ | ||
493 | |||
494 | "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ | ||
495 | "movq %%mm1, %%mm5;" /* R7 R6 R5 R4 R3 R2 R1 R0 */ | ||
496 | |||
497 | "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */ | ||
498 | "punpcklbw %%mm3, %%mm7;" /* 00 R3 00 R2 00 R1 00 R0 */ | ||
499 | |||
500 | "punpcklwd %%mm7, %%mm6;" /* 00 R1 B1 G1 00 R0 B0 G0 */ | ||
501 | MOVNTQ " %%mm6, (%1);" /* Store ARGB1 ARGB0 */ | ||
502 | |||
503 | "movq %%mm0, %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ | ||
504 | "punpcklbw %%mm2, %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */ | ||
505 | |||
506 | "punpckhwd %%mm7, %%mm6;" /* 00 R3 G3 B3 00 R2 B3 G2 */ | ||
507 | MOVNTQ " %%mm6, 8 (%1);" /* Store ARGB3 ARGB2 */ | ||
508 | |||
509 | "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */ | ||
510 | "punpckhbw %%mm3, %%mm5;" /* 00 R7 00 R6 00 R5 00 R4 */ | ||
511 | |||
512 | "punpcklwd %%mm5, %%mm4;" /* 00 R5 B5 G5 00 R4 B4 G4 */ | ||
513 | MOVNTQ " %%mm4, 16 (%1);" /* Store ARGB5 ARGB4 */ | ||
514 | |||
515 | "movq %%mm0, %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ | ||
516 | "punpckhbw %%mm2, %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */ | ||
517 | |||
518 | "punpckhwd %%mm5, %%mm4;" /* 00 R7 G7 B7 00 R6 B6 G6 */ | ||
519 | MOVNTQ " %%mm4, 24 (%1);" /* Store ARGB7 ARGB6 */ | ||
520 | |||
521 | "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ | ||
522 | "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ | ||
523 | |||
524 | "pxor %%mm4, %%mm4;" /* zero mm4 */ | ||
525 | "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ | ||
526 | |||
527 | "add $32, %1 \n\t" | ||
528 | "add $4, %0 \n\t" | ||
529 | " js 1b \n\t" | ||
530 | |||
531 | : "+r" (index), "+r" (image) | ||
532 | : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), "r" (py - 2*index) | ||
533 | ); | ||
534 | } | ||
535 | |||
536 | asm volatile (EMMS); | ||
537 | return srcSliceH; | ||
538 | } | ||
diff --git a/src/plugins/ffmpeg/libswscale/yuv2rgb_vis.c b/src/plugins/ffmpeg/libswscale/yuv2rgb_vis.c deleted file mode 100644 index 120fa56..0000000 --- a/src/plugins/ffmpeg/libswscale/yuv2rgb_vis.c +++ /dev/null | |||
@@ -1,207 +0,0 @@ | |||
1 | /* | ||
2 | * VIS optimized software YUV to RGB converter | ||
3 | * Copyright (c) 2007 Denes Balatoni <dbalatoni@programozo.hu> | ||
4 | * | ||
5 | * This file is part of FFmpeg. | ||
6 | * | ||
7 | * FFmpeg is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU Lesser General Public | ||
9 | * License as published by the Free Software Foundation; either | ||
10 | * version 2.1 of the License, or (at your option) any later version. | ||
11 | * | ||
12 | * FFmpeg is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
15 | * Lesser General Public License for more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU Lesser General Public | ||
18 | * License along with FFmpeg; if not, write to the Free Software | ||
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
20 | */ | ||
21 | |||
22 | #include <inttypes.h> | ||
23 | #include <stdlib.h> | ||
24 | |||
25 | #include "swscale.h" | ||
26 | #include "swscale_internal.h" | ||
27 | |||
28 | #define YUV2RGB_INIT \ | ||
29 | "wr %%g0, 0x10, %%gsr \n\t" \ | ||
30 | "ldd [%5], %%f32 \n\t" \ | ||
31 | "ldd [%5+8], %%f34 \n\t" \ | ||
32 | "ldd [%5+16], %%f36 \n\t" \ | ||
33 | "ldd [%5+24], %%f38 \n\t" \ | ||
34 | "ldd [%5+32], %%f40 \n\t" \ | ||
35 | "ldd [%5+40], %%f42 \n\t" \ | ||
36 | "ldd [%5+48], %%f44 \n\t" \ | ||
37 | "ldd [%5+56], %%f46 \n\t" \ | ||
38 | "ldd [%5+64], %%f48 \n\t" \ | ||
39 | "ldd [%5+72], %%f50 \n\t" | ||
40 | |||
41 | #define YUV2RGB_KERNEL \ | ||
42 | /* ^^^^ f0=Y f3=u f5=v */ \ | ||
43 | "fmul8x16 %%f3, %%f48, %%f6 \n\t" \ | ||
44 | "fmul8x16 %%f19, %%f48, %%f22 \n\t" \ | ||
45 | "fmul8x16 %%f5, %%f44, %%f8 \n\t" \ | ||
46 | "fmul8x16 %%f21, %%f44, %%f24 \n\t" \ | ||
47 | "fmul8x16 %%f0, %%f42, %%f0 \n\t" \ | ||
48 | "fmul8x16 %%f16, %%f42, %%f16 \n\t" \ | ||
49 | "fmul8x16 %%f3, %%f50, %%f2 \n\t" \ | ||
50 | "fmul8x16 %%f19, %%f50, %%f18 \n\t" \ | ||
51 | "fmul8x16 %%f5, %%f46, %%f4 \n\t" \ | ||
52 | "fmul8x16 %%f21, %%f46, %%f20 \n\t" \ | ||
53 | \ | ||
54 | "fpsub16 %%f6, %%f34, %%f6 \n\t" /* 1 */ \ | ||
55 | "fpsub16 %%f22, %%f34, %%f22 \n\t" /* 1 */ \ | ||
56 | "fpsub16 %%f8, %%f38, %%f8 \n\t" /* 3 */ \ | ||
57 | "fpsub16 %%f24, %%f38, %%f24 \n\t" /* 3 */ \ | ||
58 | "fpsub16 %%f0, %%f32, %%f0 \n\t" /* 0 */ \ | ||
59 | "fpsub16 %%f16, %%f32, %%f16 \n\t" /* 0 */ \ | ||
60 | "fpsub16 %%f2, %%f36, %%f2 \n\t" /* 2 */ \ | ||
61 | "fpsub16 %%f18, %%f36, %%f18 \n\t" /* 2 */ \ | ||
62 | "fpsub16 %%f4, %%f40, %%f4 \n\t" /* 4 */ \ | ||
63 | "fpsub16 %%f20, %%f40, %%f20 \n\t" /* 4 */ \ | ||
64 | \ | ||
65 | "fpadd16 %%f0, %%f8, %%f8 \n\t" /* Gt */ \ | ||
66 | "fpadd16 %%f16, %%f24, %%f24 \n\t" /* Gt */ \ | ||
67 | "fpadd16 %%f0, %%f4, %%f4 \n\t" /* R */ \ | ||
68 | "fpadd16 %%f16, %%f20, %%f20 \n\t" /* R */ \ | ||
69 | "fpadd16 %%f0, %%f6, %%f6 \n\t" /* B */ \ | ||
70 | "fpadd16 %%f16, %%f22, %%f22 \n\t" /* B */ \ | ||
71 | "fpadd16 %%f8, %%f2, %%f2 \n\t" /* G */ \ | ||
72 | "fpadd16 %%f24, %%f18, %%f18 \n\t" /* G */ \ | ||
73 | \ | ||
74 | "fpack16 %%f4, %%f4 \n\t" \ | ||
75 | "fpack16 %%f20, %%f20 \n\t" \ | ||
76 | "fpack16 %%f6, %%f6 \n\t" \ | ||
77 | "fpack16 %%f22, %%f22 \n\t" \ | ||
78 | "fpack16 %%f2, %%f2 \n\t" \ | ||
79 | "fpack16 %%f18, %%f18 \n\t" | ||
80 | |||
81 | |||
82 | |||
83 | static int vis_420P_ARGB32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | ||
84 | int srcSliceH, uint8_t* dst[], int dstStride[]){ | ||
85 | int y, out1, out2, out3, out4, out5, out6; | ||
86 | |||
87 | for(y=0;y < srcSliceH;++y) { | ||
88 | asm volatile ( | ||
89 | YUV2RGB_INIT | ||
90 | "wr %%g0, 0xd2, %%asi \n\t" /* ASI_FL16_P */ | ||
91 | "1: \n\t" | ||
92 | "ldda [%1] %%asi, %%f2 \n\t" | ||
93 | "ldda [%1+2] %%asi, %%f18 \n\t" | ||
94 | "ldda [%2] %%asi, %%f4 \n\t" | ||
95 | "ldda [%2+2] %%asi, %%f20 \n\t" | ||
96 | "ld [%0], %%f0 \n\t" | ||
97 | "ld [%0+4], %%f16 \n\t" | ||
98 | "fpmerge %%f3, %%f3, %%f2 \n\t" | ||
99 | "fpmerge %%f19, %%f19, %%f18 \n\t" | ||
100 | "fpmerge %%f5, %%f5, %%f4 \n\t" | ||
101 | "fpmerge %%f21, %%f21, %%f20 \n\t" | ||
102 | YUV2RGB_KERNEL | ||
103 | "fzero %%f0 \n\t" | ||
104 | "fpmerge %%f4, %%f6, %%f8 \n\t" // r,b,t1 | ||
105 | "fpmerge %%f20, %%f22, %%f24 \n\t" // r,b,t1 | ||
106 | "fpmerge %%f0, %%f2, %%f10 \n\t" // 0,g,t2 | ||
107 | "fpmerge %%f0, %%f18, %%f26 \n\t" // 0,g,t2 | ||
108 | "fpmerge %%f10, %%f8, %%f4 \n\t" // t2,t1,msb | ||
109 | "fpmerge %%f26, %%f24, %%f20 \n\t" // t2,t1,msb | ||
110 | "fpmerge %%f11, %%f9, %%f6 \n\t" // t2,t1,lsb | ||
111 | "fpmerge %%f27, %%f25, %%f22 \n\t" // t2,t1,lsb | ||
112 | "std %%f4, [%3] \n\t" | ||
113 | "std %%f20, [%3+16] \n\t" | ||
114 | "std %%f6, [%3+8] \n\t" | ||
115 | "std %%f22, [%3+24] \n\t" | ||
116 | |||
117 | "add %0, 8, %0 \n\t" | ||
118 | "add %1, 4, %1 \n\t" | ||
119 | "add %2, 4, %2 \n\t" | ||
120 | "subcc %4, 8, %4 \n\t" | ||
121 | "bne 1b \n\t" | ||
122 | "add %3, 32, %3 \n\t" //delay slot | ||
123 | : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6) | ||
124 | : "0" (src[0]+(y+srcSliceY)*srcStride[0]), "1" (src[1]+((y+srcSliceY)>>1)*srcStride[1]), | ||
125 | "2" (src[2]+((y+srcSliceY)>>1)*srcStride[2]), "3" (dst[0]+(y+srcSliceY)*dstStride[0]), | ||
126 | "4" (c->dstW), | ||
127 | "5" (c->sparc_coeffs) | ||
128 | ); | ||
129 | } | ||
130 | |||
131 | return srcSliceH; | ||
132 | } | ||
133 | |||
134 | static int vis_422P_ARGB32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, | ||
135 | int srcSliceH, uint8_t* dst[], int dstStride[]){ | ||
136 | int y, out1, out2, out3, out4, out5, out6; | ||
137 | |||
138 | for(y=0;y < srcSliceH;++y) { | ||
139 | asm volatile ( | ||
140 | YUV2RGB_INIT | ||
141 | "wr %%g0, 0xd2, %%asi \n\t" /* ASI_FL16_P */ | ||
142 | "1: \n\t" | ||
143 | "ldda [%1] %%asi, %%f2 \n\t" | ||
144 | "ldda [%1+2] %%asi, %%f18 \n\t" | ||
145 | "ldda [%2] %%asi, %%f4 \n\t" | ||
146 | "ldda [%2+2] %%asi, %%f20 \n\t" | ||
147 | "ld [%0], %%f0 \n\t" | ||
148 | "ld [%0+4], %%f16 \n\t" | ||
149 | "fpmerge %%f3, %%f3, %%f2 \n\t" | ||
150 | "fpmerge %%f19, %%f19, %%f18 \n\t" | ||
151 | "fpmerge %%f5, %%f5, %%f4 \n\t" | ||
152 | "fpmerge %%f21, %%f21, %%f20 \n\t" | ||
153 | YUV2RGB_KERNEL | ||
154 | "fzero %%f0 \n\t" | ||
155 | "fpmerge %%f4, %%f6, %%f8 \n\t" // r,b,t1 | ||
156 | "fpmerge %%f20, %%f22, %%f24 \n\t" // r,b,t1 | ||
157 | "fpmerge %%f0, %%f2, %%f10 \n\t" // 0,g,t2 | ||
158 | "fpmerge %%f0, %%f18, %%f26 \n\t" // 0,g,t2 | ||
159 | "fpmerge %%f10, %%f8, %%f4 \n\t" // t2,t1,msb | ||
160 | "fpmerge %%f26, %%f24, %%f20 \n\t" // t2,t1,msb | ||
161 | "fpmerge %%f11, %%f9, %%f6 \n\t" // t2,t1,lsb | ||
162 | "fpmerge %%f27, %%f25, %%f22 \n\t" // t2,t1,lsb | ||
163 | "std %%f4, [%3] \n\t" | ||
164 | "std %%f20, [%3+16] \n\t" | ||
165 | "std %%f6, [%3+8] \n\t" | ||
166 | "std %%f22, [%3+24] \n\t" | ||
167 | |||
168 | "add %0, 8, %0 \n\t" | ||
169 | "add %1, 4, %1 \n\t" | ||
170 | "add %2, 4, %2 \n\t" | ||
171 | "subcc %4, 8, %4 \n\t" | ||
172 | "bne 1b \n\t" | ||
173 | "add %3, 32, %3 \n\t" //delay slot | ||
174 | : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6) | ||
175 | : "0" (src[0]+(y+srcSliceY)*srcStride[0]), "1" (src[1]+(y+srcSliceY)*srcStride[1]), | ||
176 | "2" (src[2]+(y+srcSliceY)*srcStride[2]), "3" (dst[0]+(y+srcSliceY)*dstStride[0]), | ||
177 | "4" (c->dstW), | ||
178 | "5" (c->sparc_coeffs) | ||
179 | ); | ||
180 | } | ||
181 | |||
182 | return srcSliceH; | ||
183 | } | ||
184 | |||
185 | SwsFunc yuv2rgb_init_vis(SwsContext *c) { | ||
186 | c->sparc_coeffs[5]=c->yCoeff; | ||
187 | c->sparc_coeffs[6]=c->vgCoeff; | ||
188 | c->sparc_coeffs[7]=c->vrCoeff; | ||
189 | c->sparc_coeffs[8]=c->ubCoeff; | ||
190 | c->sparc_coeffs[9]=c->ugCoeff; | ||
191 | |||
192 | c->sparc_coeffs[0]=(((int16_t)c->yOffset*(int16_t)c->yCoeff >>11) & 0xffff) * 0x0001000100010001ULL; | ||
193 | c->sparc_coeffs[1]=(((int16_t)c->uOffset*(int16_t)c->ubCoeff>>11) & 0xffff) * 0x0001000100010001ULL; | ||
194 | c->sparc_coeffs[2]=(((int16_t)c->uOffset*(int16_t)c->ugCoeff>>11) & 0xffff) * 0x0001000100010001ULL; | ||
195 | c->sparc_coeffs[3]=(((int16_t)c->vOffset*(int16_t)c->vgCoeff>>11) & 0xffff) * 0x0001000100010001ULL; | ||
196 | c->sparc_coeffs[4]=(((int16_t)c->vOffset*(int16_t)c->vrCoeff>>11) & 0xffff) * 0x0001000100010001ULL; | ||
197 | |||
198 | if (c->dstFormat == PIX_FMT_RGB32 && c->srcFormat == PIX_FMT_YUV422P && (c->dstW & 7)==0) { | ||
199 | av_log(c, AV_LOG_INFO, "SPARC VIS accelerated YUV422P -> RGB32\n"); | ||
200 | return vis_422P_ARGB32; | ||
201 | } | ||
202 | else if (c->dstFormat == PIX_FMT_RGB32 && c->srcFormat == PIX_FMT_YUV420P && (c->dstW & 7)==0) { | ||
203 | av_log(c, AV_LOG_INFO, "SPARC VIS accelerated YUV420P -> RGB32\n"); | ||
204 | return vis_420P_ARGB32; | ||
205 | } | ||
206 | return NULL; | ||
207 | } | ||