author     Diego 'Flameeyes' Pettenò <flameeyes@gmail.com>    2008-03-01 03:05:13 +0100
committer  Diego 'Flameeyes' Pettenò <flameeyes@gmail.com>    2008-03-01 03:05:13 +0100
commit     1d0b3b20c34517b9d1ddf3ea347776304b0c4b44 (patch)
tree       89f4fc640c2becc6f00ae08996754952ecf149c1 /contrib/ffmpeg/libavcodec/ppc
parent     09496ad3469a0ade8dbd9a351e639b78f20b7942 (diff)
download   xine-lib-1d0b3b20c34517b9d1ddf3ea347776304b0c4b44.tar.gz
           xine-lib-1d0b3b20c34517b9d1ddf3ea347776304b0c4b44.tar.bz2
Update internal FFmpeg copy.
Diffstat (limited to 'contrib/ffmpeg/libavcodec/ppc')
-rw-r--r--  contrib/ffmpeg/libavcodec/ppc/check_altivec.c          |  75
-rw-r--r--  contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.c        | 199
-rw-r--r--  contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.h        |  80
-rw-r--r--  contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.c            |  12
-rw-r--r--  contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.h            |  14
-rw-r--r--  contrib/ffmpeg/libavcodec/ppc/fdct_altivec.c           |   4
-rw-r--r--  contrib/ffmpeg/libavcodec/ppc/fft_altivec.c            |   6
-rw-r--r--  contrib/ffmpeg/libavcodec/ppc/float_altivec.c          |   2
-rw-r--r--  contrib/ffmpeg/libavcodec/ppc/gcc_fixes.h              |  31
-rw-r--r--  contrib/ffmpeg/libavcodec/ppc/gmc_altivec.c            |  15
-rw-r--r--  contrib/ffmpeg/libavcodec/ppc/h264_altivec.c           | 601
-rw-r--r--  contrib/ffmpeg/libavcodec/ppc/h264_template_altivec.c  | 613
-rw-r--r--  contrib/ffmpeg/libavcodec/ppc/idct_altivec.c           |   7
-rw-r--r--  contrib/ffmpeg/libavcodec/ppc/imgresample_altivec.c    | 153
-rw-r--r--  contrib/ffmpeg/libavcodec/ppc/imgresample_altivec.h    |  26
-rw-r--r--  contrib/ffmpeg/libavcodec/ppc/int_altivec.c            |  80
-rw-r--r--  contrib/ffmpeg/libavcodec/ppc/mathops.h                |   5
-rw-r--r--  contrib/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c      |  82
-rw-r--r--  contrib/ffmpeg/libavcodec/ppc/mpegvideo_ppc.c          |  87
-rw-r--r--  contrib/ffmpeg/libavcodec/ppc/snow_altivec.c           |  61
-rw-r--r--  contrib/ffmpeg/libavcodec/ppc/types_altivec.h          |   5
-rw-r--r--  contrib/ffmpeg/libavcodec/ppc/util_altivec.h           | 105
-rw-r--r--  contrib/ffmpeg/libavcodec/ppc/vc1dsp_altivec.c         |  88
23 files changed, 1450 insertions, 901 deletions
diff --git a/contrib/ffmpeg/libavcodec/ppc/check_altivec.c b/contrib/ffmpeg/libavcodec/ppc/check_altivec.c
new file mode 100644
index 000000000..cf55b9a1d
--- /dev/null
+++ b/contrib/ffmpeg/libavcodec/ppc/check_altivec.c
@@ -0,0 +1,75 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+/**
+ * @file check_altivec.c
+ * Checks for AltiVec presence.
+ */
+
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+#elif __AMIGAOS4__
+#include <exec/exec.h>
+#include <interfaces/exec.h>
+#include <proto/exec.h>
+#endif /* __APPLE__ */
+
+/**
+ * This function MAY rely on signal() or fork() in order to make sure AltiVec
+ * is present.
+ */
+
+int has_altivec(void)
+{
+#ifdef __AMIGAOS4__
+ ULONG result = 0;
+ extern struct ExecIFace *IExec;
+
+ IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE);
+ if (result == VECTORTYPE_ALTIVEC) return 1;
+ return 0;
+#elif __APPLE__
+ int sels[2] = {CTL_HW, HW_VECTORUNIT};
+ int has_vu = 0;
+ size_t len = sizeof(has_vu);
+ int err;
+
+ err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
+
+ if (err == 0) return (has_vu != 0);
+ return 0;
+#elif defined(RUNTIME_CPUDETECT)
+ int proc_ver;
+ // support of mfspr PVR emulation added in Linux 2.6.17
+ asm volatile("mfspr %0, 287" : "=r" (proc_ver));
+ proc_ver >>= 16;
+ if (proc_ver & 0x8000 ||
+ proc_ver == 0x000c ||
+ proc_ver == 0x0039 || proc_ver == 0x003c ||
+ proc_ver == 0x0044 || proc_ver == 0x0045 ||
+ proc_ver == 0x0070)
+ return 1;
+ return 0;
+#else
+ // since we were compiled for altivec, just assume we have it
+ // until someone comes up with a proper way (not involving signal hacks).
+ return 1;
+#endif /* __AMIGAOS4__ */
+}
+
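Editor's note: as a hedged sketch of how this probe is typically consumed (the
diff's own dsputil_init_ppc() gates on has_altivec() further down), the result
is usually cached once at init rather than re-probing the hardware per call.
The wrapper and static flags below are hypothetical, not part of the commit:

    /* Hypothetical caching wrapper -- has_altivec() touches sysctl() or an
     * SPR, so probe once and remember the answer. */
    static int altivec_checked = 0;
    static int altivec_present = 0;

    static int have_altivec_cached(void)
    {
        if (!altivec_checked) {
            altivec_present = has_altivec();
            altivec_checked = 1;
        }
        return altivec_present;
    }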
diff --git a/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.c b/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.c
index bbc53d761..3d79c3ab5 100644
--- a/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.c
+++ b/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.c
@@ -20,44 +20,18 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include "../dsputil.h"
+#include "dsputil.h"
#include "gcc_fixes.h"
-#include "dsputil_altivec.h"
-
-#ifdef CONFIG_DARWIN
-#include <sys/sysctl.h>
-#else /* CONFIG_DARWIN */
-#ifdef __AMIGAOS4__
-#include <exec/exec.h>
-#include <interfaces/exec.h>
-#include <proto/exec.h>
-#else /* __AMIGAOS4__ */
-#include <signal.h>
-#include <setjmp.h>
-
-static sigjmp_buf jmpbuf;
-static volatile sig_atomic_t canjump = 0;
-
-static void sigill_handler (int sig)
-{
- if (!canjump) {
- signal (sig, SIG_DFL);
- raise (sig);
- }
-
- canjump = 0;
- siglongjmp (jmpbuf, 1);
-}
-#endif /* CONFIG_DARWIN */
-#endif /* __AMIGAOS4__ */
+#include "dsputil_ppc.h"
+#include "util_altivec.h"
int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
int i;
- int s __attribute__((aligned(16)));
- const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
+ DECLARE_ALIGNED_16(int, s);
+ const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
vector unsigned char *tv;
vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
vector unsigned int sad;
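Editor's note: the recurring change in this file swaps the raw GCC attribute
for FFmpeg's DECLARE_ALIGNED_16 macro. A minimal sketch of what such a macro
expands to (the canonical definition lives in FFmpeg's own headers, not in
this diff) shows why it is the more portable spelling:

    /* Sketch only -- consult FFmpeg's headers for the real macro. */
    #if defined(__GNUC__)
    #define DECLARE_ALIGNED_16(t, v) t v __attribute__((aligned(16)))
    #elif defined(_MSC_VER)
    #define DECLARE_ALIGNED_16(t, v) __declspec(align(16)) t v
    #endif

    DECLARE_ALIGNED_16(int, s);  /* 'int s' placed on a 16-byte boundary */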
@@ -103,8 +77,8 @@ int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h
int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
int i;
- int s __attribute__((aligned(16)));
- const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
+ DECLARE_ALIGNED_16(int, s);
+ const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
vector unsigned char *tv;
vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
vector unsigned int sad;
@@ -163,10 +137,10 @@ int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h
int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
int i;
- int s __attribute__((aligned(16)));
+ DECLARE_ALIGNED_16(int, s);
uint8_t *pix3 = pix2 + line_size;
- const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
- const_vector unsigned short two = (const_vector unsigned short)vec_splat_u16(2);
+ const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
+ const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
vector unsigned char *tv, avgv, t5;
vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
@@ -218,7 +192,7 @@ int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int
pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));
/*
- Note that Altivec does have vec_avg, but this works on vector pairs
+ Note that AltiVec does have vec_avg, but this works on vector pairs
and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
Instead, we have to split the pixel vectors into vectors of shorts,
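Editor's note: the rounding claim in that comment checks out in scalar
arithmetic, since vec_avg computes (a + b + 1) >> 1:

    /* avg rounds up: avg(3,0) = 2 and avg(0,1) = 1, so the nested form
     * avg(avg(3,0), avg(0,1)) = avg(2,1) = 2, while the correctly rounded
     * four-way mean (3+0+0+1+2) >> 2 = 1, as the code's vctwo rounder
     * intends. */
    #define AVG_RU(a, b) (((a) + (b) + 1) >> 1)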
@@ -264,8 +238,8 @@ int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int
int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
int i;
- int s __attribute__((aligned(16)));
- const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
+ DECLARE_ALIGNED_16(int, s);
+ const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
vector unsigned char perm1, perm2, *pix1v, *pix2v;
vector unsigned char t1, t2, t3,t4, t5;
vector unsigned int sad;
@@ -306,8 +280,8 @@ int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
int i;
- int s __attribute__((aligned(16)));
- const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
+ DECLARE_ALIGNED_16(int, s);
+ const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
vector unsigned char t1, t2, t3,t4, t5;
vector unsigned int sad;
@@ -351,8 +325,8 @@ int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
int pix_norm1_altivec(uint8_t *pix, int line_size)
{
int i;
- int s __attribute__((aligned(16)));
- const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
+ DECLARE_ALIGNED_16(int, s);
+ const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
vector unsigned char *tv;
vector unsigned char pixv;
vector unsigned int sv;
@@ -387,8 +361,8 @@ int pix_norm1_altivec(uint8_t *pix, int line_size)
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
int i;
- int s __attribute__((aligned(16)));
- const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
+ DECLARE_ALIGNED_16(int, s);
+ const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
vector unsigned char t1, t2, t3,t4, t5;
vector unsigned int sum;
@@ -443,8 +417,8 @@ int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
int i;
- int s __attribute__((aligned(16)));
- const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
+ DECLARE_ALIGNED_16(int, s);
+ const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
vector unsigned char perm1, perm2, *pix1v, *pix2v;
vector unsigned char t1, t2, t3,t4, t5;
vector unsigned int sum;
@@ -488,14 +462,14 @@ int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
int pix_sum_altivec(uint8_t * pix, int line_size)
{
- const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
+ const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
vector unsigned char perm, *pixv;
vector unsigned char t1;
vector unsigned int sad;
vector signed int sumdiffs;
int i;
- int s __attribute__((aligned(16)));
+ DECLARE_ALIGNED_16(int, s);
sad = (vector unsigned int)vec_splat_u32(0);
@@ -523,7 +497,7 @@ void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line
{
int i;
vector unsigned char perm, bytes, *pixv;
- const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
+ const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
vector signed short shorts;
for(i=0;i<8;i++)
@@ -550,7 +524,7 @@ void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
{
int i;
vector unsigned char perm, bytes, *pixv;
- const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
+ const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
vector signed short shorts1, shorts2;
for(i=0;i<4;i++)
@@ -769,8 +743,8 @@ POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
blockv, temp1, temp2;
register vector unsigned short
pixelssum1, pixelssum2, temp3;
- register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
- register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
+ register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
+ register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
temp1 = vec_ld(0, pixels);
temp2 = vec_ld(16, pixels);
@@ -845,9 +819,9 @@ POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
blockv, temp1, temp2;
register vector unsigned short
pixelssum1, pixelssum2, temp3;
- register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
- register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
- register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
+ register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
+ register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
+ register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
temp1 = vec_ld(0, pixels);
temp2 = vec_ld(16, pixels);
@@ -922,8 +896,8 @@ POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
register vector unsigned short
pixelssum1, pixelssum2, temp3,
pixelssum3, pixelssum4, temp4;
- register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
- register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
+ register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
+ register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
@@ -1004,9 +978,9 @@ POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
register vector unsigned short
pixelssum1, pixelssum2, temp3,
pixelssum3, pixelssum4, temp4;
- register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
- register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
- register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
+ register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
+ register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
+ register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
@@ -1078,25 +1052,25 @@ POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
int sum;
- register const_vector unsigned char vzero =
- (const_vector unsigned char)vec_splat_u8(0);
+ register const vector unsigned char vzero =
+ (const vector unsigned char)vec_splat_u8(0);
register vector signed short temp0, temp1, temp2, temp3, temp4,
temp5, temp6, temp7;
POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
{
- register const_vector signed short vprod1 =(const_vector signed short)
+ register const vector signed short vprod1 =(const vector signed short)
AVV( 1,-1, 1,-1, 1,-1, 1,-1);
- register const_vector signed short vprod2 =(const_vector signed short)
+ register const vector signed short vprod2 =(const vector signed short)
AVV( 1, 1,-1,-1, 1, 1,-1,-1);
- register const_vector signed short vprod3 =(const_vector signed short)
+ register const vector signed short vprod3 =(const vector signed short)
AVV( 1, 1, 1, 1,-1,-1,-1,-1);
- register const_vector unsigned char perm1 = (const_vector unsigned char)
+ register const vector unsigned char perm1 = (const vector unsigned char)
AVV(0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D);
- register const_vector unsigned char perm2 = (const_vector unsigned char)
+ register const vector unsigned char perm2 = (const vector unsigned char)
AVV(0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B);
- register const_vector unsigned char perm3 = (const_vector unsigned char)
+ register const vector unsigned char perm3 = (const vector unsigned char)
AVV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
@@ -1120,7 +1094,7 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
dstV = \
(vector signed short)vec_mergeh((vector signed char)vzero, \
(vector signed char)dstO); \
- /* substractions inside the first butterfly */ \
+ /* subtractions inside the first butterfly */ \
but0 = vec_sub(srcV, dstV); \
op1 = vec_perm(but0, but0, perm1); \
but1 = vec_mladd(but0, vprod1, op1); \
@@ -1201,7 +1175,7 @@ POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
schedule for the 7450, and its code isn't much faster than
gcc-3.3 on the 7450 (but uses 25% fewer instructions...)
- On the 970, the hand-made RA is still a win (arount 690
+ On the 970, the hand-made RA is still a win (around 690
vs. around 780), but xlc goes to around 660 on the
regular C code...
*/
@@ -1226,25 +1200,25 @@ static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst,
temp5S REG_v(v13),
temp6S REG_v(v14),
temp7S REG_v(v15);
- register const_vector unsigned char vzero REG_v(v31)=
- (const_vector unsigned char)vec_splat_u8(0);
+ register const vector unsigned char vzero REG_v(v31)=
+ (const vector unsigned char)vec_splat_u8(0);
{
- register const_vector signed short vprod1 REG_v(v16)=
- (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
- register const_vector signed short vprod2 REG_v(v17)=
- (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
- register const_vector signed short vprod3 REG_v(v18)=
- (const_vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1);
- register const_vector unsigned char perm1 REG_v(v19)=
- (const_vector unsigned char)
+ register const vector signed short vprod1 REG_v(v16)=
+ (const vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
+ register const vector signed short vprod2 REG_v(v17)=
+ (const vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
+ register const vector signed short vprod3 REG_v(v18)=
+ (const vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1);
+ register const vector unsigned char perm1 REG_v(v19)=
+ (const vector unsigned char)
AVV(0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D);
- register const_vector unsigned char perm2 REG_v(v20)=
- (const_vector unsigned char)
+ register const vector unsigned char perm2 REG_v(v20)=
+ (const vector unsigned char)
AVV(0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B);
- register const_vector unsigned char perm3 REG_v(v21)=
- (const_vector unsigned char)
+ register const vector unsigned char perm3 REG_v(v21)=
+ (const vector unsigned char)
AVV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
@@ -1293,7 +1267,7 @@ static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst,
dstW = \
(vector signed short)vec_mergel((vector signed char)vzero, \
(vector signed char)dstO); \
- /* substractions inside the first butterfly */ \
+ /* subtractions inside the first butterfly */ \
but0 = vec_sub(srcV, dstV); \
but0S = vec_sub(srcW, dstW); \
op1 = vec_perm(but0, but0, perm1); \
@@ -1419,50 +1393,6 @@ POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
return score;
}
-int has_altivec(void)
-{
-#ifdef __AMIGAOS4__
- ULONG result = 0;
- extern struct ExecIFace *IExec;
-
- IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE);
- if (result == VECTORTYPE_ALTIVEC) return 1;
- return 0;
-#else /* __AMIGAOS4__ */
-
-#ifdef CONFIG_DARWIN
- int sels[2] = {CTL_HW, HW_VECTORUNIT};
- int has_vu = 0;
- size_t len = sizeof(has_vu);
- int err;
-
- err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
-
- if (err == 0) return (has_vu != 0);
-#else /* CONFIG_DARWIN */
-/* no Darwin, do it the brute-force way */
-/* this is borrowed from the libmpeg2 library */
- {
- signal (SIGILL, sigill_handler);
- if (sigsetjmp (jmpbuf, 1)) {
- signal (SIGILL, SIG_DFL);
- } else {
- canjump = 1;
-
- asm volatile ("mtspr 256, %0\n\t"
- "vand %%v0, %%v0, %%v0"
- :
- : "r" (-1));
-
- signal (SIGILL, SIG_DFL);
- return 1;
- }
- }
-#endif /* CONFIG_DARWIN */
- return 0;
-#endif /* __AMIGAOS4__ */
-}
-
static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
int blocksize)
{
@@ -1495,9 +1425,9 @@ POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);
register vector unsigned char blockv, temp1, temp2, blocktemp;
register vector unsigned short pixelssum1, pixelssum2, temp3;
- register const_vector unsigned char vczero = (const_vector unsigned char)
+ register const vector unsigned char vczero = (const vector unsigned char)
vec_splat_u8(0);
- register const_vector unsigned short vctwo = (const_vector unsigned short)
+ register const vector unsigned short vctwo = (const vector unsigned short)
vec_splat_u16(2);
temp1 = vec_ld(0, pixels);
@@ -1583,7 +1513,6 @@ void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)
c->hadamard8_diff[0] = hadamard8_diff16_altivec;
c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
-#ifdef CONFIG_VORBIS_DECODER
- c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
-#endif
+ if (ENABLE_VORBIS_DECODER)
+ c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
}
diff --git a/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.h b/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.h
index 560d778bb..43bd5abab 100644
--- a/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.h
+++ b/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.h
@@ -20,12 +20,10 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#ifndef _DSPUTIL_ALTIVEC_
-#define _DSPUTIL_ALTIVEC_
+#ifndef FFMPEG_DSPUTIL_ALTIVEC_H
+#define FFMPEG_DSPUTIL_ALTIVEC_H
-#include "dsputil_ppc.h"
-
-#ifdef HAVE_ALTIVEC
+#include <stdint.h>
extern int has_altivec(void);
@@ -33,74 +31,4 @@ void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size,
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-// used to build registers permutation vectors (vcprm)
-// the 's' are for words in the _s_econd vector
-#define WORD_0 0x00,0x01,0x02,0x03
-#define WORD_1 0x04,0x05,0x06,0x07
-#define WORD_2 0x08,0x09,0x0a,0x0b
-#define WORD_3 0x0c,0x0d,0x0e,0x0f
-#define WORD_s0 0x10,0x11,0x12,0x13
-#define WORD_s1 0x14,0x15,0x16,0x17
-#define WORD_s2 0x18,0x19,0x1a,0x1b
-#define WORD_s3 0x1c,0x1d,0x1e,0x1f
-
-#ifdef CONFIG_DARWIN
-#define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d)
-#else
-#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
-#endif
-
-// vcprmle is used to keep the same index as in the SSE version.
-// it's the same as vcprm, with the index inversed
-// ('le' is Little Endian)
-#define vcprmle(a,b,c,d) vcprm(d,c,b,a)
-
-// used to build inverse/identity vectors (vcii)
-// n is _n_egative, p is _p_ositive
-#define FLOAT_n -1.
-#define FLOAT_p 1.
-
-
-#ifdef CONFIG_DARWIN
-#define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d)
-#else
-#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
-#endif
-
-// Transpose 8x8 matrix of 16-bit elements (in-place)
-#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
-do { \
- vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \
- vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \
- \
- A1 = vec_mergeh (a, e); \
- B1 = vec_mergel (a, e); \
- C1 = vec_mergeh (b, f); \
- D1 = vec_mergel (b, f); \
- E1 = vec_mergeh (c, g); \
- F1 = vec_mergel (c, g); \
- G1 = vec_mergeh (d, h); \
- H1 = vec_mergel (d, h); \
- \
- A2 = vec_mergeh (A1, E1); \
- B2 = vec_mergel (A1, E1); \
- C2 = vec_mergeh (B1, F1); \
- D2 = vec_mergel (B1, F1); \
- E2 = vec_mergeh (C1, G1); \
- F2 = vec_mergel (C1, G1); \
- G2 = vec_mergeh (D1, H1); \
- H2 = vec_mergel (D1, H1); \
- \
- a = vec_mergeh (A2, E2); \
- b = vec_mergel (A2, E2); \
- c = vec_mergeh (B2, F2); \
- d = vec_mergel (B2, F2); \
- e = vec_mergeh (C2, G2); \
- f = vec_mergel (C2, G2); \
- g = vec_mergeh (D2, H2); \
- h = vec_mergel (D2, H2); \
-} while (0)
-
-#endif /* HAVE_ALTIVEC */
-
-#endif /* _DSPUTIL_ALTIVEC_ */
+#endif /* FFMPEG_DSPUTIL_ALTIVEC_H */
diff --git a/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.c b/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.c
index 117a7adf1..13dea06a1 100644
--- a/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.c
+++ b/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.c
@@ -20,7 +20,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include "../dsputil.h"
+#include "dsputil.h"
#include "dsputil_ppc.h"
@@ -39,6 +39,7 @@ void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx);
void vc1dsp_init_altivec(DSPContext* c, AVCodecContext *avctx);
void snow_init_altivec(DSPContext* c, AVCodecContext *avctx);
void float_init_altivec(DSPContext* c, AVCodecContext *avctx);
+void int_init_altivec(DSPContext* c, AVCodecContext *avctx);
#endif
@@ -154,11 +155,7 @@ POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
i += 16;
}
for ( ; i < sizeof(DCTELEM)*6*64-31 ; i += 32) {
-#ifndef __MWERKS__
asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
-#else
- __dcbz( blocks, i );
-#endif
}
if (misal) {
((unsigned long*)blocks)[188] = 0L;
@@ -213,7 +210,7 @@ void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
knows about dcbzl ... */
long check_dcbzl_effect(void)
{
- register char *fakedata = (char*)av_malloc(1024);
+ register char *fakedata = av_malloc(1024);
register char *fakedata_middle;
register long zero = 0;
register long i = 0;
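Editor's note: the remainder of this probe falls outside the hunk; as a hedged
sketch of its idea, it fills a buffer with a nonzero pattern, issues a single
dcbzl, and counts how many bytes were zeroed -- 32 indicates plain dcbz-style
behaviour, 128 the 970's long cache line (alignment fixup omitted here):

    /* Illustration only; variable names follow the hunk above. */
    long count = 0;
    memset(fakedata, 0xFF, 1024);
    asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero) : "memory");
    for (i = 0; i < 128; i++)
        if (fakedata_middle[i] == 0)
            count++;          /* 32 => dcbz-sized line, 128 => G5 dcbzl */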
@@ -260,7 +257,7 @@ static void prefetch_ppc(void *mem, int stride, int h)
void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
{
- // Common optimizations whether Altivec is available or not
+ // Common optimizations whether AltiVec is available or not
c->prefetch = prefetch_ppc;
switch (check_dcbzl_effect()) {
case 32:
@@ -284,6 +281,7 @@ void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
if(ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER)
vc1dsp_init_altivec(c, avctx);
float_init_altivec(c, avctx);
+ int_init_altivec(c, avctx);
c->gmc1 = gmc1_altivec;
#ifdef CONFIG_ENCODERS
diff --git a/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.h b/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.h
index 5b25732b2..d8f6b27f9 100644
--- a/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.h
+++ b/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.h
@@ -18,14 +18,14 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#ifndef _DSPUTIL_PPC_
-#define _DSPUTIL_PPC_
+#ifndef FFMPEG_DSPUTIL_PPC_H
+#define FFMPEG_DSPUTIL_PPC_H
#ifdef CONFIG_POWERPC_PERF
void powerpc_display_perf_report(void);
/* the 604* have 2, the G3* have 4, the G4s have 6,
and the G5 are completely different (they MUST use
- POWERPC_MODE_64BITS, and let's hope all future 64 bis PPC
+ HAVE_PPC64, and let's hope all future 64 bit PPC
will use the same PMCs... */
#define POWERPC_NUM_PMC_ENABLED 6
/* if you add to the enum below, also add to the perfname array
@@ -68,7 +68,7 @@ enum powerpc_data_index {
};
extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];
-#ifndef POWERPC_MODE_64BITS
+#ifndef HAVE_PPC64
#define POWERP_PMC_DATATYPE unsigned long
#define POWERPC_GET_PMC1(a) asm volatile("mfspr %0, 937" : "=r" (a))
#define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 938" : "=r" (a))
@@ -86,7 +86,7 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][
#define POWERPC_GET_PMC5(a) do {} while (0)
#define POWERPC_GET_PMC6(a) do {} while (0)
#endif
-#else /* POWERPC_MODE_64BITS */
+#else /* HAVE_PPC64 */
#define POWERP_PMC_DATATYPE unsigned long long
#define POWERPC_GET_PMC1(a) asm volatile("mfspr %0, 771" : "=r" (a))
#define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 772" : "=r" (a))
@@ -104,7 +104,7 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][
#define POWERPC_GET_PMC5(a) do {} while (0)
#define POWERPC_GET_PMC6(a) do {} while (0)
#endif
-#endif /* POWERPC_MODE_64BITS */
+#endif /* HAVE_PPC64 */
#define POWERPC_PERF_DECLARE(a, cond) \
POWERP_PMC_DATATYPE \
pmc_start[POWERPC_NUM_PMC_ENABLED], \
@@ -152,4 +152,4 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][
#define POWERPC_PERF_STOP_COUNT(a, cond) do {} while (0)
#endif /* CONFIG_POWERPC_PERF */
-#endif /* _DSPUTIL_PPC_ */
+#endif /* FFMPEG_DSPUTIL_PPC_H */
diff --git a/contrib/ffmpeg/libavcodec/ppc/fdct_altivec.c b/contrib/ffmpeg/libavcodec/ppc/fdct_altivec.c
index 2418c32bb..6b9a35ba8 100644
--- a/contrib/ffmpeg/libavcodec/ppc/fdct_altivec.c
+++ b/contrib/ffmpeg/libavcodec/ppc/fdct_altivec.c
@@ -21,8 +21,8 @@
#include "common.h"
-#include "../dsputil.h"
-#include "dsputil_altivec.h"
+#include "dsputil.h"
+#include "dsputil_ppc.h"
#include "gcc_fixes.h"
diff --git a/contrib/ffmpeg/libavcodec/ppc/fft_altivec.c b/contrib/ffmpeg/libavcodec/ppc/fft_altivec.c
index 384a774ff..e0b77807f 100644
--- a/contrib/ffmpeg/libavcodec/ppc/fft_altivec.c
+++ b/contrib/ffmpeg/libavcodec/ppc/fft_altivec.c
@@ -20,12 +20,12 @@
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include "../dsputil.h"
+#include "dsputil.h"
#include "gcc_fixes.h"
-#include "dsputil_altivec.h"
-
+#include "dsputil_ppc.h"
+#include "util_altivec.h"
/*
those three macros are from libavcodec/fft.c
and are required for the reference C code
diff --git a/contrib/ffmpeg/libavcodec/ppc/float_altivec.c b/contrib/ffmpeg/libavcodec/ppc/float_altivec.c
index 22c2de61a..750e6d7f9 100644
--- a/contrib/ffmpeg/libavcodec/ppc/float_altivec.c
+++ b/contrib/ffmpeg/libavcodec/ppc/float_altivec.c
@@ -18,7 +18,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include "../dsputil.h"
+#include "dsputil.h"
#include "gcc_fixes.h"
diff --git a/contrib/ffmpeg/libavcodec/ppc/gcc_fixes.h b/contrib/ffmpeg/libavcodec/ppc/gcc_fixes.h
index 5a4a55188..b8a908a61 100644
--- a/contrib/ffmpeg/libavcodec/ppc/gcc_fixes.h
+++ b/contrib/ffmpeg/libavcodec/ppc/gcc_fixes.h
@@ -20,31 +20,22 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#ifndef _GCC_FIXES_
-#define _GCC_FIXES_
+#ifndef FFMPEG_GCC_FIXES_H
+#define FFMPEG_GCC_FIXES_H
+
+#include "config.h"
#ifdef HAVE_ALTIVEC_H
#include <altivec.h>
#endif
-#ifdef CONFIG_DARWIN
-# ifndef __MWERKS__
-# define AVV(x...) (x)
-# else
-# define AVV
-# endif
-#define REG_v(a) asm ( #a )
-#else
-
-#define AVV(x...) {x}
-
#if (__GNUC__ < 4)
# define REG_v(a)
#else
# define REG_v(a) asm ( #a )
#endif
-#if (__GNUC__ * 100 + __GNUC_MINOR__ < 303)
+#if (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
/* This code was provided to me by Bartosch Pixa
* as a separate header file (broken_mergel.h).
@@ -106,14 +97,6 @@ __ch (__bin_args_eq (vector unsigned int, (a1), vector unsigned int, (a2)), \
((vector unsigned int) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
__altivec_link_error_invalid_argument ())))))))
-#endif
-
-#endif /* CONFIG_DARWIN */
-
-#ifndef __MWERKS__
-#define const_vector const vector
-#else
-#define const_vector vector
-#endif
+#endif /* (__GNUC__ == 3 && __GNUC_MINOR__ < 3) */
-#endif /* _GCC_FIXES_ */
+#endif /* FFMPEG_GCC_FIXES_H */
diff --git a/contrib/ffmpeg/libavcodec/ppc/gmc_altivec.c b/contrib/ffmpeg/libavcodec/ppc/gmc_altivec.c
index 42c936bb3..8151410d4 100644
--- a/contrib/ffmpeg/libavcodec/ppc/gmc_altivec.c
+++ b/contrib/ffmpeg/libavcodec/ppc/gmc_altivec.c
@@ -20,24 +20,25 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include "../dsputil.h"
+#include "dsputil.h"
#include "gcc_fixes.h"
-#include "dsputil_altivec.h"
+#include "dsputil_ppc.h"
+#include "util_altivec.h"
/*
AltiVec-enhanced gmc1. ATM this code assumes stride is a multiple of 8,
- to preserve proper dst alignement.
+ to preserve proper dst alignment.
*/
#define GMC1_PERF_COND (h==8)
void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder)
{
POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND);
- const unsigned short __attribute__ ((aligned(16))) rounder_a[8] =
+ const DECLARE_ALIGNED_16(unsigned short, rounder_a[8]) =
{rounder, rounder, rounder, rounder,
rounder, rounder, rounder, rounder};
- const unsigned short __attribute__ ((aligned(16))) ABCD[8] =
+ const DECLARE_ALIGNED_16(unsigned short, ABCD[8]) =
{
(16-x16)*(16-y16), /* A */
( x16)*(16-y16), /* B */
@@ -45,8 +46,8 @@ POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND);
( x16)*( y16), /* D */
0, 0, 0, 0 /* padding */
};
- register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
- register const_vector unsigned short vcsr8 = (const_vector unsigned short)vec_splat_u16(8);
+ register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
+ register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8);
register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD;
register vector unsigned short Av, Bv, Cv, Dv, rounderV, tempA, tempB, tempC, tempD;
int i;
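Editor's note: the ABCD table above is the usual bilinear kernel for MPEG-4
GMC with one warp point. A scalar reference of what the vector loop computes
per pixel (a sketch; A+B+C+D = 256, hence the shift by 8):

    int A = (16 - x16) * (16 - y16);
    int B = (     x16) * (16 - y16);
    int C = (16 - x16) * (     y16);
    int D = (     x16) * (     y16);
    dst[i] = (A * src[i]          + B * src[i + 1] +
              C * src[i + stride] + D * src[i + stride + 1] +
              rounder) >> 8;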
diff --git a/contrib/ffmpeg/libavcodec/ppc/h264_altivec.c b/contrib/ffmpeg/libavcodec/ppc/h264_altivec.c
index bac620e82..c716b1e33 100644
--- a/contrib/ffmpeg/libavcodec/ppc/h264_altivec.c
+++ b/contrib/ffmpeg/libavcodec/ppc/h264_altivec.c
@@ -18,11 +18,13 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include "../dsputil.h"
+#include "dsputil.h"
#include "gcc_fixes.h"
+#include "dsputil_ppc.h"
#include "dsputil_altivec.h"
+#include "util_altivec.h"
#include "types_altivec.h"
#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
@@ -180,130 +182,124 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint
/* this code assumes that stride % 16 == 0 */
void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
- signed int ABCD[4] __attribute__((aligned(16))) =
+ DECLARE_ALIGNED_16(signed int, ABCD[4]) =
{((8 - x) * (8 - y)),
- ((x) * (8 - y)),
- ((8 - x) * (y)),
- ((x) * (y))};
+ ((x) * (8 - y)),
+ ((8 - x) * (y)),
+ ((x) * (y))};
register int i;
- vector unsigned char fperm;
- const vector signed int vABCD = vec_ld(0, ABCD);
- const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
- const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
- const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
- const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
- const vector signed int vzero = vec_splat_s32(0);
- const vector signed short v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
- const vector unsigned short v6us = vec_splat_u16(6);
- register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
+ vec_u8_t fperm;
+ const vec_s32_t vABCD = vec_ld(0, ABCD);
+ const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
+ const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
+ const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
+ const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
+ LOAD_ZERO;
+ const vec_s16_t v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
+ const vec_u16_t v6us = vec_splat_u16(6);
+ register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
- vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
- vector unsigned char vsrc0uc, vsrc1uc;
- vector signed short vsrc0ssH, vsrc1ssH;
- vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
- vector signed short vsrc2ssH, vsrc3ssH, psum;
- vector unsigned char vdst, ppsum, fsum;
+ vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
+ vec_u8_t vsrc0uc, vsrc1uc;
+ vec_s16_t vsrc0ssH, vsrc1ssH;
+ vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
+ vec_s16_t vsrc2ssH, vsrc3ssH, psum;
+ vec_u8_t vdst, ppsum, fsum;
if (((unsigned long)dst) % 16 == 0) {
- fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
- 0x14, 0x15, 0x16, 0x17,
- 0x08, 0x09, 0x0A, 0x0B,
- 0x0C, 0x0D, 0x0E, 0x0F);
+ fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13,
+ 0x14, 0x15, 0x16, 0x17,
+ 0x08, 0x09, 0x0A, 0x0B,
+ 0x0C, 0x0D, 0x0E, 0x0F);
} else {
- fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
- 0x04, 0x05, 0x06, 0x07,
- 0x18, 0x19, 0x1A, 0x1B,
- 0x1C, 0x1D, 0x1E, 0x1F);
+ fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03,
+ 0x04, 0x05, 0x06, 0x07,
+ 0x18, 0x19, 0x1A, 0x1B,
+ 0x1C, 0x1D, 0x1E, 0x1F);
}
vsrcAuc = vec_ld(0, src);
if (loadSecond)
- vsrcBuc = vec_ld(16, src);
+ vsrcBuc = vec_ld(16, src);
vsrcperm0 = vec_lvsl(0, src);
vsrcperm1 = vec_lvsl(1, src);
vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
if (reallyBadAlign)
- vsrc1uc = vsrcBuc;
+ vsrc1uc = vsrcBuc;
else
- vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
+ vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
- vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
- (vector unsigned char)vsrc0uc);
- vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
- (vector unsigned char)vsrc1uc);
+ vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc0uc);
+ vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc1uc);
if (!loadSecond) {// -> !reallyBadAlign
- for (i = 0 ; i < h ; i++) {
+ for (i = 0 ; i < h ; i++) {
- vsrcCuc = vec_ld(stride + 0, src);
+ vsrcCuc = vec_ld(stride + 0, src);
- vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
- vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
+ vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
+ vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
- vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
- (vector unsigned char)vsrc2uc);
- vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
- (vector unsigned char)vsrc3uc);
+ vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc);
+ vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc);
- psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
- psum = vec_mladd(vB, vsrc1ssH, psum);
- psum = vec_mladd(vC, vsrc2ssH, psum);
- psum = vec_mladd(vD, vsrc3ssH, psum);
- psum = vec_add(v28ss, psum);
- psum = vec_sra(psum, v6us);
+ psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
+ psum = vec_mladd(vB, vsrc1ssH, psum);
+ psum = vec_mladd(vC, vsrc2ssH, psum);
+ psum = vec_mladd(vD, vsrc3ssH, psum);
+ psum = vec_add(v28ss, psum);
+ psum = vec_sra(psum, v6us);
- vdst = vec_ld(0, dst);
- ppsum = (vector unsigned char)vec_packsu(psum, psum);
- fsum = vec_perm(vdst, ppsum, fperm);
+ vdst = vec_ld(0, dst);
+ ppsum = (vec_u8_t)vec_packsu(psum, psum);
+ fsum = vec_perm(vdst, ppsum, fperm);
- vec_st(fsum, 0, dst);
+ vec_st(fsum, 0, dst);
- vsrc0ssH = vsrc2ssH;
- vsrc1ssH = vsrc3ssH;
+ vsrc0ssH = vsrc2ssH;
+ vsrc1ssH = vsrc3ssH;
- dst += stride;
- src += stride;
- }
+ dst += stride;
+ src += stride;
+ }
} else {
- vector unsigned char vsrcDuc;
- for (i = 0 ; i < h ; i++) {
- vsrcCuc = vec_ld(stride + 0, src);
- vsrcDuc = vec_ld(stride + 16, src);
-
- vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
- if (reallyBadAlign)
- vsrc3uc = vsrcDuc;
- else
- vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
-
- vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
- (vector unsigned char)vsrc2uc);
- vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
- (vector unsigned char)vsrc3uc);
-
- psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
- psum = vec_mladd(vB, vsrc1ssH, psum);
- psum = vec_mladd(vC, vsrc2ssH, psum);
- psum = vec_mladd(vD, vsrc3ssH, psum);
- psum = vec_add(v28ss, psum);
- psum = vec_sr(psum, v6us);
-
- vdst = vec_ld(0, dst);
- ppsum = (vector unsigned char)vec_pack(psum, psum);
- fsum = vec_perm(vdst, ppsum, fperm);
-
- vec_st(fsum, 0, dst);
-
- vsrc0ssH = vsrc2ssH;
- vsrc1ssH = vsrc3ssH;
-
- dst += stride;
- src += stride;
- }
+ vec_u8_t vsrcDuc;
+ for (i = 0 ; i < h ; i++) {
+ vsrcCuc = vec_ld(stride + 0, src);
+ vsrcDuc = vec_ld(stride + 16, src);
+
+ vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
+ if (reallyBadAlign)
+ vsrc3uc = vsrcDuc;
+ else
+ vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
+
+ vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc);
+ vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc);
+
+ psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
+ psum = vec_mladd(vB, vsrc1ssH, psum);
+ psum = vec_mladd(vC, vsrc2ssH, psum);
+ psum = vec_mladd(vD, vsrc3ssH, psum);
+ psum = vec_add(v28ss, psum);
+ psum = vec_sr(psum, v6us);
+
+ vdst = vec_ld(0, dst);
+ ppsum = (vec_u8_t)vec_pack(psum, psum);
+ fsum = vec_perm(vdst, ppsum, fperm);
+
+ vec_st(fsum, 0, dst);
+
+ vsrc0ssH = vsrc2ssH;
+ vsrc1ssH = vsrc3ssH;
+
+ dst += stride;
+ src += stride;
+ }
}
}
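Editor's note: the ABCD weights {(8-x)(8-y), x(8-y), (8-x)y, xy} set up at the
top of this function are the H.264 eighth-pel chroma interpolation kernel. In
scalar form the loop computes, per pixel (note this "no rnd" variant adds the
v28ss constant 28 before the shift, where the rounded variant uses 32):

    dst[i] = (A * src[i]          + B * src[i + 1] +
              C * src[i + stride] + D * src[i + stride + 1] + 28) >> 6;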
@@ -312,7 +308,7 @@ static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
int src_stride1, int h)
{
int i;
- vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align;
+ vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align;
mask_ = vec_lvsl(0, src2);
@@ -354,7 +350,7 @@ static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
int src_stride1, int h)
{
int i;
- vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align;
+ vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align;
mask_ = vec_lvsl(0, src2);
@@ -404,6 +400,82 @@ static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
* IDCT transform:
****************************************************************************/
+#define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3) \
+ /* 1st stage */ \
+ vz0 = vec_add(vb0,vb2); /* temp[0] = Y[0] + Y[2] */ \
+ vz1 = vec_sub(vb0,vb2); /* temp[1] = Y[0] - Y[2] */ \
+ vz2 = vec_sra(vb1,vec_splat_u16(1)); \
+ vz2 = vec_sub(vz2,vb3); /* temp[2] = Y[1].1/2 - Y[3] */ \
+ vz3 = vec_sra(vb3,vec_splat_u16(1)); \
+ vz3 = vec_add(vb1,vz3); /* temp[3] = Y[1] + Y[3].1/2 */ \
+ /* 2nd stage: output */ \
+ va0 = vec_add(vz0,vz3); /* x[0] = temp[0] + temp[3] */ \
+ va1 = vec_add(vz1,vz2); /* x[1] = temp[1] + temp[2] */ \
+ va2 = vec_sub(vz1,vz2); /* x[2] = temp[1] - temp[2] */ \
+ va3 = vec_sub(vz0,vz3) /* x[3] = temp[0] - temp[3] */
+
+#define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \
+ b0 = vec_mergeh( a0, a0 ); \
+ b1 = vec_mergeh( a1, a0 ); \
+ b2 = vec_mergeh( a2, a0 ); \
+ b3 = vec_mergeh( a3, a0 ); \
+ a0 = vec_mergeh( b0, b2 ); \
+ a1 = vec_mergel( b0, b2 ); \
+ a2 = vec_mergeh( b1, b3 ); \
+ a3 = vec_mergel( b1, b3 ); \
+ b0 = vec_mergeh( a0, a2 ); \
+ b1 = vec_mergel( a0, a2 ); \
+ b2 = vec_mergeh( a1, a3 ); \
+ b3 = vec_mergel( a1, a3 )
+
+#define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \
+ vdst_orig = vec_ld(0, dst); \
+ vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \
+ vdst_ss = (vec_s16_t) vec_mergeh(zero_u8v, vdst); \
+ va = vec_add(va, vdst_ss); \
+ va_u8 = vec_packsu(va, zero_s16v); \
+ va_u32 = vec_splat((vec_u32_t)va_u8, 0); \
+ vec_ste(va_u32, element, (uint32_t*)dst);
+
+static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
+{
+ vec_s16_t va0, va1, va2, va3;
+ vec_s16_t vz0, vz1, vz2, vz3;
+ vec_s16_t vtmp0, vtmp1, vtmp2, vtmp3;
+ vec_u8_t va_u8;
+ vec_u32_t va_u32;
+ vec_s16_t vdst_ss;
+ const vec_u16_t v6us = vec_splat_u16(6);
+ vec_u8_t vdst, vdst_orig;
+ vec_u8_t vdst_mask = vec_lvsl(0, dst);
+ int element = ((unsigned long)dst & 0xf) >> 2;
+ LOAD_ZERO;
+
+ block[0] += 32; /* add 32 as a DC-level for rounding */
+
+ vtmp0 = vec_ld(0,block);
+ vtmp1 = vec_sld(vtmp0, vtmp0, 8);
+ vtmp2 = vec_ld(16,block);
+ vtmp3 = vec_sld(vtmp2, vtmp2, 8);
+
+ VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
+ VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3);
+ VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
+
+ va0 = vec_sra(va0,v6us);
+ va1 = vec_sra(va1,v6us);
+ va2 = vec_sra(va2,v6us);
+ va3 = vec_sra(va3,v6us);
+
+ VEC_LOAD_U8_ADD_S16_STORE_U8(va0);
+ dst += stride;
+ VEC_LOAD_U8_ADD_S16_STORE_U8(va1);
+ dst += stride;
+ VEC_LOAD_U8_ADD_S16_STORE_U8(va2);
+ dst += stride;
+ VEC_LOAD_U8_ADD_S16_STORE_U8(va3);
+}
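Editor's note: VEC_1D_DCT above is the standard H.264 4x4 inverse-transform
butterfly. Transcribing the macro's own comments into scalar C (b0..b3 one row
of int16_t coefficients, z*/a* plain ints):

    /* 1st stage */
    z0 = b0 + b2;            /* temp[0] = Y[0] + Y[2]     */
    z1 = b0 - b2;            /* temp[1] = Y[0] - Y[2]     */
    z2 = (b1 >> 1) - b3;     /* temp[2] = Y[1]/2 - Y[3]   */
    z3 = b1 + (b3 >> 1);     /* temp[3] = Y[1] + Y[3]/2   */
    /* 2nd stage: output */
    a0 = z0 + z3;  a1 = z1 + z2;  a2 = z1 - z2;  a3 = z0 - z3;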
+
#define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\
/* a0 = SRC(0) + SRC(4); */ \
vec_s16_t a0v = vec_add(s0, s4); \
@@ -491,8 +563,7 @@ void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) {
const vec_u16_t twov = vec_splat_u16(2);
const vec_u16_t sixv = vec_splat_u16(6);
- const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0,
- -1,-1,-1,-1,-1,-1,-1,-1);
+ const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1);
LOAD_ZERO;
dct[0] += 32; // rounding for the >>6 at the end
@@ -524,42 +595,310 @@ void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) {
ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
}
+#define transpose4x16(r0, r1, r2, r3) { \
+ register vec_u8_t r4; \
+ register vec_u8_t r5; \
+ register vec_u8_t r6; \
+ register vec_u8_t r7; \
+ \
+ r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \
+ r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \
+ r6 = vec_mergeh(r1, r3); /*1, 3 set 0*/ \
+ r7 = vec_mergel(r1, r3); /*1, 3 set 1*/ \
+ \
+ r0 = vec_mergeh(r4, r6); /*all set 0*/ \
+ r1 = vec_mergel(r4, r6); /*all set 1*/ \
+ r2 = vec_mergeh(r5, r7); /*all set 2*/ \
+ r3 = vec_mergel(r5, r7); /*all set 3*/ \
+}
+
+static inline void write16x4(uint8_t *dst, int dst_stride,
+ register vec_u8_t r0, register vec_u8_t r1,
+ register vec_u8_t r2, register vec_u8_t r3) {
+ DECLARE_ALIGNED_16(unsigned char, result[64]);
+ uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
+ int int_dst_stride = dst_stride/4;
+
+ vec_st(r0, 0, result);
+ vec_st(r1, 16, result);
+ vec_st(r2, 32, result);
+ vec_st(r3, 48, result);
+ /* FIXME: there has to be a better way!!!! */
+ *dst_int = *src_int;
+ *(dst_int+ int_dst_stride) = *(src_int + 1);
+ *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
+ *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
+ *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
+ *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
+ *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
+ *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
+ *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
+ *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
+ *(dst_int+10*int_dst_stride) = *(src_int + 10);
+ *(dst_int+11*int_dst_stride) = *(src_int + 11);
+ *(dst_int+12*int_dst_stride) = *(src_int + 12);
+ *(dst_int+13*int_dst_stride) = *(src_int + 13);
+ *(dst_int+14*int_dst_stride) = *(src_int + 14);
+ *(dst_int+15*int_dst_stride) = *(src_int + 15);
+}
+
+/** \brief performs a 6x16 transpose of data in src, and stores it to dst
+ \todo FIXME: see if we can't spare some vec_lvsl() by factorizing them
+ out of unaligned_load() */
+#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
+ register vec_u8_t r0 = unaligned_load(0, src); \
+ register vec_u8_t r1 = unaligned_load( src_stride, src); \
+ register vec_u8_t r2 = unaligned_load(2* src_stride, src); \
+ register vec_u8_t r3 = unaligned_load(3* src_stride, src); \
+ register vec_u8_t r4 = unaligned_load(4* src_stride, src); \
+ register vec_u8_t r5 = unaligned_load(5* src_stride, src); \
+ register vec_u8_t r6 = unaligned_load(6* src_stride, src); \
+ register vec_u8_t r7 = unaligned_load(7* src_stride, src); \
+ register vec_u8_t r14 = unaligned_load(14*src_stride, src); \
+ register vec_u8_t r15 = unaligned_load(15*src_stride, src); \
+ \
+ r8 = unaligned_load( 8*src_stride, src); \
+ r9 = unaligned_load( 9*src_stride, src); \
+ r10 = unaligned_load(10*src_stride, src); \
+ r11 = unaligned_load(11*src_stride, src); \
+ r12 = unaligned_load(12*src_stride, src); \
+ r13 = unaligned_load(13*src_stride, src); \
+ \
+ /*Merge first pairs*/ \
+ r0 = vec_mergeh(r0, r8); /*0, 8*/ \
+ r1 = vec_mergeh(r1, r9); /*1, 9*/ \
+ r2 = vec_mergeh(r2, r10); /*2,10*/ \
+ r3 = vec_mergeh(r3, r11); /*3,11*/ \
+ r4 = vec_mergeh(r4, r12); /*4,12*/ \
+ r5 = vec_mergeh(r5, r13); /*5,13*/ \
+ r6 = vec_mergeh(r6, r14); /*6,14*/ \
+ r7 = vec_mergeh(r7, r15); /*7,15*/ \
+ \
+ /*Merge second pairs*/ \
+ r8 = vec_mergeh(r0, r4); /*0,4, 8,12 set 0*/ \
+ r9 = vec_mergel(r0, r4); /*0,4, 8,12 set 1*/ \
+ r10 = vec_mergeh(r1, r5); /*1,5, 9,13 set 0*/ \
+ r11 = vec_mergel(r1, r5); /*1,5, 9,13 set 1*/ \
+ r12 = vec_mergeh(r2, r6); /*2,6,10,14 set 0*/ \
+ r13 = vec_mergel(r2, r6); /*2,6,10,14 set 1*/ \
+ r14 = vec_mergeh(r3, r7); /*3,7,11,15 set 0*/ \
+ r15 = vec_mergel(r3, r7); /*3,7,11,15 set 1*/ \
+ \
+ /*Third merge*/ \
+ r0 = vec_mergeh(r8, r12); /*0,2,4,6,8,10,12,14 set 0*/ \
+ r1 = vec_mergel(r8, r12); /*0,2,4,6,8,10,12,14 set 1*/ \
+ r2 = vec_mergeh(r9, r13); /*0,2,4,6,8,10,12,14 set 2*/ \
+ r4 = vec_mergeh(r10, r14); /*1,3,5,7,9,11,13,15 set 0*/ \
+ r5 = vec_mergel(r10, r14); /*1,3,5,7,9,11,13,15 set 1*/ \
+ r6 = vec_mergeh(r11, r15); /*1,3,5,7,9,11,13,15 set 2*/ \
+ /* Don't need to compute 3 and 7*/ \
+ \
+ /*Final merge*/ \
+ r8 = vec_mergeh(r0, r4); /*all set 0*/ \
+ r9 = vec_mergel(r0, r4); /*all set 1*/ \
+ r10 = vec_mergeh(r1, r5); /*all set 2*/ \
+ r11 = vec_mergel(r1, r5); /*all set 3*/ \
+ r12 = vec_mergeh(r2, r6); /*all set 4*/ \
+ r13 = vec_mergel(r2, r6); /*all set 5*/ \
+ /* Don't need to compute 14 and 15*/ \
+ \
+}
+
+// out: o = |x-y| < a
+static inline vec_u8_t diff_lt_altivec ( register vec_u8_t x,
+ register vec_u8_t y,
+ register vec_u8_t a) {
+
+ register vec_u8_t diff = vec_subs(x, y);
+ register vec_u8_t diffneg = vec_subs(y, x);
+ register vec_u8_t o = vec_or(diff, diffneg); /* |x-y| */
+ o = (vec_u8_t)vec_cmplt(o, a);
+ return o;
+}
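Editor's note: diff_lt_altivec builds |x-y| without widening to shorts -- each
saturating subtraction clamps the "wrong" direction to 0, so OR-ing the two
yields the absolute difference. The byte-wise scalar equivalent:

    unsigned char d1 = x > y ? x - y : 0;              /* vec_subs(x, y) */
    unsigned char d2 = y > x ? y - x : 0;              /* vec_subs(y, x) */
    unsigned char o  = ((d1 | d2) < a) ? 0xFF : 0x00;  /* vec_cmplt      */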
+
+static inline vec_u8_t h264_deblock_mask ( register vec_u8_t p0,
+ register vec_u8_t p1,
+ register vec_u8_t q0,
+ register vec_u8_t q1,
+ register vec_u8_t alpha,
+ register vec_u8_t beta) {
+
+ register vec_u8_t mask;
+ register vec_u8_t tempmask;
+
+ mask = diff_lt_altivec(p0, q0, alpha);
+ tempmask = diff_lt_altivec(p1, p0, beta);
+ mask = vec_and(mask, tempmask);
+ tempmask = diff_lt_altivec(q1, q0, beta);
+ mask = vec_and(mask, tempmask);
+
+ return mask;
+}
+
+// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
+static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0,
+ register vec_u8_t p1,
+ register vec_u8_t p2,
+ register vec_u8_t q0,
+ register vec_u8_t tc0) {
+
+ register vec_u8_t average = vec_avg(p0, q0);
+ register vec_u8_t temp;
+ register vec_u8_t uncliped;
+ register vec_u8_t ones;
+ register vec_u8_t max;
+ register vec_u8_t min;
+ register vec_u8_t newp1;
+
+ temp = vec_xor(average, p2);
+ average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */
+ ones = vec_splat_u8(1);
+ temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */
+ uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */
+ max = vec_adds(p1, tc0);
+ min = vec_subs(p1, tc0);
+ newp1 = vec_max(min, uncliped);
+ newp1 = vec_min(max, newp1);
+ return newp1;
+}
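Editor's note: the xor/and trick in h264_deblock_q1 corrects vec_avg's
round-up so the outer average floors, matching the spec's
(p2 + ((p0+q0+1)>>1)) >> 1. The scalar identity it relies on:

    /* floor((a + b) / 2) == ((a + b + 1) >> 1) - ((a ^ b) & 1)
     * -- the correction bit is 1 exactly when a and b differ in parity,
     * i.e. exactly when the round-up added one. */
    unsigned avg_up = (a + b + 1) >> 1;        /* what vec_avg gives */
    unsigned avg_dn = avg_up - ((a ^ b) & 1);  /* floored average    */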
+
+#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \
+ \
+ const vec_u8_t A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \
+ \
+ register vec_u8_t pq0bit = vec_xor(p0,q0); \
+ register vec_u8_t q1minus; \
+ register vec_u8_t p0minus; \
+ register vec_u8_t stage1; \
+ register vec_u8_t stage2; \
+ register vec_u8_t vec160; \
+ register vec_u8_t delta; \
+ register vec_u8_t deltaneg; \
+ \
+ q1minus = vec_nor(q1, q1); /* 255 - q1 */ \
+ stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \
+ stage2 = vec_sr(stage1, vec_splat_u8(1)); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \
+ p0minus = vec_nor(p0, p0); /* 255 - p0 */ \
+ stage1 = vec_avg(q0, p0minus); /* (q0 - p0 + 256)>>1 */ \
+ pq0bit = vec_and(pq0bit, vec_splat_u8(1)); \
+ stage2 = vec_avg(stage2, pq0bit); /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */ \
+ stage2 = vec_adds(stage2, stage1); /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */ \
+ vec160 = vec_ld(0, &A0v); \
+ deltaneg = vec_subs(vec160, stage2); /* -d */ \
+ delta = vec_subs(stage2, vec160); /* d */ \
+ deltaneg = vec_min(tc0masked, deltaneg); \
+ delta = vec_min(tc0masked, delta); \
+ p0 = vec_subs(p0, deltaneg); \
+ q0 = vec_subs(q0, delta); \
+ p0 = vec_adds(p0, delta); \
+ q0 = vec_adds(q0, deltaneg); \
+}
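Editor's note: h264_deblock_p0_q0 evaluates the standard H.264 luma edge
correction entirely in unsigned bytes by biasing around 160 (the A0v constant).
A hedged scalar reference of the delta it implements, in spec form (clip_uint8
is a hypothetical saturation helper; the vector code saturates for free via
vec_adds/vec_subs and clamps against tc0masked with vec_min):

    int d = ((((int)q0 - p0) << 2) + (p1 - q1) + 4) >> 3;
    if      (d < -tc) d = -tc;
    else if (d >  tc) d =  tc;
    p0 = clip_uint8(p0 + d);
    q0 = clip_uint8(q0 - d);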
+
+#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \
+ DECLARE_ALIGNED_16(unsigned char, temp[16]); \
+ register vec_u8_t alphavec; \
+ register vec_u8_t betavec; \
+ register vec_u8_t mask; \
+ register vec_u8_t p1mask; \
+ register vec_u8_t q1mask; \
+ register vector signed char tc0vec; \
+ register vec_u8_t finaltc0; \
+ register vec_u8_t tc0masked; \
+ register vec_u8_t newp1; \
+ register vec_u8_t newq1; \
+ \
+ temp[0] = alpha; \
+ temp[1] = beta; \
+ alphavec = vec_ld(0, temp); \
+ betavec = vec_splat(alphavec, 0x1); \
+ alphavec = vec_splat(alphavec, 0x0); \
+ mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /*if in block */ \
+ \
+ *((int *)temp) = *((int *)tc0); \
+ tc0vec = vec_ld(0, (signed char*)temp); \
+ tc0vec = vec_mergeh(tc0vec, tc0vec); \
+ tc0vec = vec_mergeh(tc0vec, tc0vec); \
+ mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \
+ finaltc0 = vec_and((vec_u8_t)tc0vec, mask); /* tc = tc0 */ \
+ \
+ p1mask = diff_lt_altivec(p2, p0, betavec); \
+ p1mask = vec_and(p1mask, mask); /* if ( |p2 - p0| < beta) */ \
+ tc0masked = vec_and(p1mask, (vec_u8_t)tc0vec); \
+ finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \
+ newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \
+ /*end if*/ \
+ \
+ q1mask = diff_lt_altivec(q2, q0, betavec); \
+ q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\
+ tc0masked = vec_and(q1mask, (vec_u8_t)tc0vec); \
+ finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \
+ newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \
+ /*end if*/ \
+ \
+ h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \
+ p1 = newp1; \
+ q1 = newq1; \
+}
+
+static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
+
+ if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
+ register vec_u8_t p2 = vec_ld(-3*stride, pix);
+ register vec_u8_t p1 = vec_ld(-2*stride, pix);
+ register vec_u8_t p0 = vec_ld(-1*stride, pix);
+ register vec_u8_t q0 = vec_ld(0, pix);
+ register vec_u8_t q1 = vec_ld(stride, pix);
+ register vec_u8_t q2 = vec_ld(2*stride, pix);
+ h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
+ vec_st(p1, -2*stride, pix);
+ vec_st(p0, -1*stride, pix);
+ vec_st(q0, 0, pix);
+ vec_st(q1, stride, pix);
+ }
+}
+
+static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
+
+ register vec_u8_t line0, line1, line2, line3, line4, line5;
+ if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
+ return;
+ readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
+ h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0);
+ transpose4x16(line1, line2, line3, line4);
+ write16x4(pix-2, stride, line1, line2, line3, line4);
+}
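Editor's note: both filters gate on the bitwise AND of the four signed tc0
bytes. The AND's sign bit survives only if every tc0[i] is negative, so the
early-out fires only when filtering is disabled for the entire edge; per-
segment disabling is handled later by the tc0vec compare:

    /* sign(tc0[0] & tc0[1] & tc0[2] & tc0[3]) set  <=>  all four < 0 */
    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
        return;   /* whole edge unfiltered */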
+
void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
-#ifdef HAVE_ALTIVEC
- if (has_altivec()) {
- c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec;
- c->put_no_rnd_h264_chroma_pixels_tab[0] = put_no_rnd_h264_chroma_mc8_altivec;
- c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec;
- c->h264_idct8_add = ff_h264_idct8_add_altivec;
+ if (has_altivec()) {
+ c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec;
+ c->put_no_rnd_h264_chroma_pixels_tab[0] = put_no_rnd_h264_chroma_mc8_altivec;
+ c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec;
+ c->h264_idct_add = ff_h264_idct_add_altivec;
+ c->h264_idct8_add = ff_h264_idct8_add_altivec;
+ c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
+ c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;
#define dspfunc(PFX, IDX, NUM) \
- c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \
- c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \
- c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_altivec; \
- c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_altivec; \
- c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_altivec; \
- c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_altivec; \
- c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_altivec; \
- c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_altivec; \
- c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_altivec; \
- c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_altivec; \
- c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_altivec; \
- c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_altivec; \
- c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_altivec; \
- c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_altivec; \
- c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_altivec; \
- c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_altivec
-
- dspfunc(put_h264_qpel, 0, 16);
- dspfunc(avg_h264_qpel, 0, 16);
+ c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \
+ c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \
+ c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_altivec; \
+ c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_altivec; \
+ c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_altivec; \
+ c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_altivec; \
+ c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_altivec; \
+ c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_altivec; \
+ c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_altivec; \
+ c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_altivec; \
+ c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_altivec; \
+ c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_altivec; \
+ c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_altivec; \
+ c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_altivec; \
+ c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_altivec; \
+ c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_altivec
+
+ dspfunc(put_h264_qpel, 0, 16);
+ dspfunc(avg_h264_qpel, 0, 16);
#undef dspfunc
-
- } else
-#endif /* HAVE_ALTIVEC */
- {
- // Non-AltiVec PPC optimisations
-
- // ... pending ...
- }
+ }
}
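
The dspfunc macro above relies on token pasting: each invocation fills all 16 quarter-pel motion-compensation slots in one table. For example, dspfunc(put_h264_qpel, 0, 16) expands to assignments of the form:

    c->put_h264_qpel_pixels_tab[0][ 0] = put_h264_qpel16_mc00_altivec;
    c->put_h264_qpel_pixels_tab[0][ 1] = put_h264_qpel16_mc10_altivec;
    /* ... one entry per (x,y) quarter-pel phase ... */
    c->put_h264_qpel_pixels_tab[0][15] = put_h264_qpel16_mc33_altivec;
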
diff --git a/contrib/ffmpeg/libavcodec/ppc/h264_template_altivec.c b/contrib/ffmpeg/libavcodec/ppc/h264_template_altivec.c
index e8ad67f2f..d8ad96419 100644
--- a/contrib/ffmpeg/libavcodec/ppc/h264_template_altivec.c
+++ b/contrib/ffmpeg/libavcodec/ppc/h264_template_altivec.c
@@ -18,186 +18,227 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+//#define DEBUG_ALIGNMENT
+#ifdef DEBUG_ALIGNMENT
+#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
+#else
+#define ASSERT_ALIGNED(ptr) ;
+#endif
+
 /* this code assumes that stride % 16 == 0 */
-void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
+
+#define CHROMA_MC8_ALTIVEC_CORE \
+ vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);\
+ vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);\
+\
+ psum = vec_mladd(vA, vsrc0ssH, v32ss);\
+ psum = vec_mladd(vB, vsrc1ssH, psum);\
+ psum = vec_mladd(vC, vsrc2ssH, psum);\
+ psum = vec_mladd(vD, vsrc3ssH, psum);\
+ psum = vec_sr(psum, v6us);\
+\
+ vdst = vec_ld(0, dst);\
+ ppsum = (vec_u8_t)vec_pack(psum, psum);\
+ vfdst = vec_perm(vdst, ppsum, fperm);\
+\
+ OP_U8_ALTIVEC(fsum, vfdst, vdst);\
+\
+ vec_st(fsum, 0, dst);\
+\
+ vsrc0ssH = vsrc2ssH;\
+ vsrc1ssH = vsrc3ssH;\
+\
+ dst += stride;\
+ src += stride;
+
+#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
+\
+ vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);\
+ vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);\
+\
+ psum = vec_mladd(vA, vsrc0ssH, v32ss);\
+ psum = vec_mladd(vE, vsrc1ssH, psum);\
+ psum = vec_sr(psum, v6us);\
+\
+ vdst = vec_ld(0, dst);\
+ ppsum = (vec_u8_t)vec_pack(psum, psum);\
+ vfdst = vec_perm(vdst, ppsum, fperm);\
+\
+ OP_U8_ALTIVEC(fsum, vfdst, vdst);\
+\
+ vec_st(fsum, 0, dst);\
+\
+ dst += stride;\
+ src += stride;
+
+void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
+ int stride, int h, int x, int y) {
POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
- signed int ABCD[4] __attribute__((aligned(16))) =
+ DECLARE_ALIGNED_16(signed int, ABCD[4]) =
{((8 - x) * (8 - y)),
- ((x) * (8 - y)),
- ((8 - x) * (y)),
- ((x) * (y))};
+ (( x) * (8 - y)),
+ ((8 - x) * ( y)),
+ (( x) * ( y))};
register int i;
- vector unsigned char fperm;
- const vector signed int vABCD = vec_ld(0, ABCD);
- const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
- const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
- const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
- const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
- const vector signed int vzero = vec_splat_s32(0);
- const vector signed short v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
- const vector unsigned short v6us = vec_splat_u16(6);
+ vec_u8_t fperm;
+ const vec_s32_t vABCD = vec_ld(0, ABCD);
+ const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
+ const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
+ const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
+ const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
+ LOAD_ZERO;
+ const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
+ const vec_u16_t v6us = vec_splat_u16(6);
register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
- vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
- vector unsigned char vsrc0uc, vsrc1uc;
- vector signed short vsrc0ssH, vsrc1ssH;
- vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
- vector signed short vsrc2ssH, vsrc3ssH, psum;
- vector unsigned char vdst, ppsum, vfdst, fsum;
+ vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
+ vec_u8_t vsrc0uc, vsrc1uc;
+ vec_s16_t vsrc0ssH, vsrc1ssH;
+ vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
+ vec_s16_t vsrc2ssH, vsrc3ssH, psum;
+ vec_u8_t vdst, ppsum, vfdst, fsum;
POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
if (((unsigned long)dst) % 16 == 0) {
- fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
- 0x14, 0x15, 0x16, 0x17,
- 0x08, 0x09, 0x0A, 0x0B,
- 0x0C, 0x0D, 0x0E, 0x0F);
+ fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13,
+ 0x14, 0x15, 0x16, 0x17,
+ 0x08, 0x09, 0x0A, 0x0B,
+ 0x0C, 0x0D, 0x0E, 0x0F);
} else {
- fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
- 0x04, 0x05, 0x06, 0x07,
- 0x18, 0x19, 0x1A, 0x1B,
- 0x1C, 0x1D, 0x1E, 0x1F);
+ fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03,
+ 0x04, 0x05, 0x06, 0x07,
+ 0x18, 0x19, 0x1A, 0x1B,
+ 0x1C, 0x1D, 0x1E, 0x1F);
}
vsrcAuc = vec_ld(0, src);
if (loadSecond)
- vsrcBuc = vec_ld(16, src);
+ vsrcBuc = vec_ld(16, src);
vsrcperm0 = vec_lvsl(0, src);
vsrcperm1 = vec_lvsl(1, src);
vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
if (reallyBadAlign)
- vsrc1uc = vsrcBuc;
+ vsrc1uc = vsrcBuc;
else
- vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
-
- vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
- (vector unsigned char)vsrc0uc);
- vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
- (vector unsigned char)vsrc1uc);
-
- if (!loadSecond) {// -> !reallyBadAlign
- for (i = 0 ; i < h ; i++) {
-
-
- vsrcCuc = vec_ld(stride + 0, src);
-
- vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
- vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
-
- vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
- (vector unsigned char)vsrc2uc);
- vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
- (vector unsigned char)vsrc3uc);
-
- psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
- psum = vec_mladd(vB, vsrc1ssH, psum);
- psum = vec_mladd(vC, vsrc2ssH, psum);
- psum = vec_mladd(vD, vsrc3ssH, psum);
- psum = vec_add(v32ss, psum);
- psum = vec_sra(psum, v6us);
-
- vdst = vec_ld(0, dst);
- ppsum = (vector unsigned char)vec_packsu(psum, psum);
- vfdst = vec_perm(vdst, ppsum, fperm);
-
- OP_U8_ALTIVEC(fsum, vfdst, vdst);
-
- vec_st(fsum, 0, dst);
-
- vsrc0ssH = vsrc2ssH;
- vsrc1ssH = vsrc3ssH;
-
- dst += stride;
- src += stride;
- }
+ vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
+
+ vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);
+ vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);
+
+ if (ABCD[3]) {
+ if (!loadSecond) {// -> !reallyBadAlign
+ for (i = 0 ; i < h ; i++) {
+ vsrcCuc = vec_ld(stride + 0, src);
+ vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
+ vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
+
+ CHROMA_MC8_ALTIVEC_CORE
+ }
+ } else {
+ vec_u8_t vsrcDuc;
+ for (i = 0 ; i < h ; i++) {
+ vsrcCuc = vec_ld(stride + 0, src);
+ vsrcDuc = vec_ld(stride + 16, src);
+ vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
+ if (reallyBadAlign)
+ vsrc3uc = vsrcDuc;
+ else
+ vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
+
+ CHROMA_MC8_ALTIVEC_CORE
+ }
+ }
} else {
- vector unsigned char vsrcDuc;
- for (i = 0 ; i < h ; i++) {
- vsrcCuc = vec_ld(stride + 0, src);
- vsrcDuc = vec_ld(stride + 16, src);
-
- vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
- if (reallyBadAlign)
- vsrc3uc = vsrcDuc;
- else
- vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
-
- vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
- (vector unsigned char)vsrc2uc);
- vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
- (vector unsigned char)vsrc3uc);
-
- psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
- psum = vec_mladd(vB, vsrc1ssH, psum);
- psum = vec_mladd(vC, vsrc2ssH, psum);
- psum = vec_mladd(vD, vsrc3ssH, psum);
- psum = vec_add(v32ss, psum);
- psum = vec_sr(psum, v6us);
-
- vdst = vec_ld(0, dst);
- ppsum = (vector unsigned char)vec_pack(psum, psum);
- vfdst = vec_perm(vdst, ppsum, fperm);
-
- OP_U8_ALTIVEC(fsum, vfdst, vdst);
-
- vec_st(fsum, 0, dst);
-
- vsrc0ssH = vsrc2ssH;
- vsrc1ssH = vsrc3ssH;
-
- dst += stride;
- src += stride;
- }
+ const vec_s16_t vE = vec_add(vB, vC);
+ if (ABCD[2]) { // x == 0 B == 0
+ if (!loadSecond) {// -> !reallyBadAlign
+ for (i = 0 ; i < h ; i++) {
+ vsrcCuc = vec_ld(stride + 0, src);
+ vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
+ CHROMA_MC8_ALTIVEC_CORE_SIMPLE
+
+ vsrc0uc = vsrc1uc;
+ }
+ } else {
+ vec_u8_t vsrcDuc;
+ for (i = 0 ; i < h ; i++) {
+ vsrcCuc = vec_ld(stride + 0, src);
+ vsrcDuc = vec_ld(stride + 15, src);
+ vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
+ CHROMA_MC8_ALTIVEC_CORE_SIMPLE
+
+ vsrc0uc = vsrc1uc;
+ }
+ }
+ } else { // y == 0 C == 0
+ if (!loadSecond) {// -> !reallyBadAlign
+ for (i = 0 ; i < h ; i++) {
+ vsrcCuc = vec_ld(0, src);
+ vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
+ vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
+
+ CHROMA_MC8_ALTIVEC_CORE_SIMPLE
+ }
+ } else {
+ vec_u8_t vsrcDuc;
+ for (i = 0 ; i < h ; i++) {
+ vsrcCuc = vec_ld(0, src);
+ vsrcDuc = vec_ld(15, src);
+ vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
+ if (reallyBadAlign)
+ vsrc1uc = vsrcDuc;
+ else
+ vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
+
+ CHROMA_MC8_ALTIVEC_CORE_SIMPLE
+ }
+ }
+ }
}
POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
}
+#undef CHROMA_MC8_ALTIVEC_CORE
+
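
The ABCD weights computed above follow the standard H.264 bilinear chroma interpolation: (8-x)(8-y), x(8-y), (8-x)y and xy over a 2x2 neighbourhood, with rounding constant 32 and a shift by 6. The ABCD[3]/ABCD[2] branches specialize the degenerate x == 0 or y == 0 cases, where the filter collapses to the two-tap vE = vB + vC form. A scalar reference sketch of the full 8-wide case (put variant; the template's OP_U8_ALTIVEC also provides an averaging variant):

    static void chroma_mc8_ref(uint8_t *dst, const uint8_t *src,
                               int stride, int h, int x, int y)
    {
        const int A = (8 - x) * (8 - y), B = x * (8 - y);
        const int C = (8 - x) * y,       D = x * y;
        int i, j;
        for (j = 0; j < h; j++) {
            for (i = 0; i < 8; i++)
                dst[i] = (A * src[i]          + B * src[i + 1] +
                          C * src[i + stride] + D * src[i + stride + 1] +
                          32) >> 6;
            dst += stride;
            src += stride;
        }
    }
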
 /* this code assumes stride % 16 == 0 */
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
register int i;
- const vector signed int vzero = vec_splat_s32(0);
- const vector unsigned char permM2 = vec_lvsl(-2, src);
- const vector unsigned char permM1 = vec_lvsl(-1, src);
- const vector unsigned char permP0 = vec_lvsl(+0, src);
- const vector unsigned char permP1 = vec_lvsl(+1, src);
- const vector unsigned char permP2 = vec_lvsl(+2, src);
- const vector unsigned char permP3 = vec_lvsl(+3, src);
- const vector signed short v5ss = vec_splat_s16(5);
- const vector unsigned short v5us = vec_splat_u16(5);
- const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
- const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
- const vector unsigned char dstperm = vec_lvsr(0, dst);
- const vector unsigned char neg1 =
- (const vector unsigned char) vec_splat_s8(-1);
-
- const vector unsigned char dstmask =
- vec_perm((const vector unsigned char)vzero,
- neg1, dstperm);
-
- vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
+ LOAD_ZERO;
+ const vec_u8_t permM2 = vec_lvsl(-2, src);
+ const vec_u8_t permM1 = vec_lvsl(-1, src);
+ const vec_u8_t permP0 = vec_lvsl(+0, src);
+ const vec_u8_t permP1 = vec_lvsl(+1, src);
+ const vec_u8_t permP2 = vec_lvsl(+2, src);
+ const vec_u8_t permP3 = vec_lvsl(+3, src);
+ const vec_s16_t v5ss = vec_splat_s16(5);
+ const vec_u16_t v5us = vec_splat_u16(5);
+ const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
+ const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
+
+ vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
register int align = ((((unsigned long)src) - 2) % 16);
- vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
+ vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
srcP2A, srcP2B, srcP3A, srcP3B,
srcM1A, srcM1B, srcM2A, srcM2B,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
psumA, psumB, sumA, sumB;
- vector unsigned char sum, dst1, dst2, vdst, fsum,
- rsum, fdst1, fdst2;
+ vec_u8_t sum, vdst, fsum;
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
for (i = 0 ; i < 16 ; i ++) {
- vector unsigned char srcR1 = vec_ld(-2, src);
- vector unsigned char srcR2 = vec_ld(14, src);
+ vec_u8_t srcR1 = vec_ld(-2, src);
+ vec_u8_t srcR2 = vec_ld(14, src);
switch (align) {
default: {
@@ -217,7 +258,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3 = srcR2;
} break;
case 12: {
- vector unsigned char srcR3 = vec_ld(30, src);
+ vec_u8_t srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0);
@@ -226,7 +267,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
case 13: {
- vector unsigned char srcR3 = vec_ld(30, src);
+ vec_u8_t srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0);
@@ -235,7 +276,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
case 14: {
- vector unsigned char srcR3 = vec_ld(30, src);
+ vec_u8_t srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = srcR2;
@@ -244,7 +285,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
case 15: {
- vector unsigned char srcR3 = vec_ld(30, src);
+ vec_u8_t srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = srcR2;
srcP0 = vec_perm(srcR2, srcR3, permP0);
@@ -254,32 +295,20 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
} break;
}
- srcP0A = (vector signed short)
- vec_mergeh((vector unsigned char)vzero, srcP0);
- srcP0B = (vector signed short)
- vec_mergel((vector unsigned char)vzero, srcP0);
- srcP1A = (vector signed short)
- vec_mergeh((vector unsigned char)vzero, srcP1);
- srcP1B = (vector signed short)
- vec_mergel((vector unsigned char)vzero, srcP1);
-
- srcP2A = (vector signed short)
- vec_mergeh((vector unsigned char)vzero, srcP2);
- srcP2B = (vector signed short)
- vec_mergel((vector unsigned char)vzero, srcP2);
- srcP3A = (vector signed short)
- vec_mergeh((vector unsigned char)vzero, srcP3);
- srcP3B = (vector signed short)
- vec_mergel((vector unsigned char)vzero, srcP3);
-
- srcM1A = (vector signed short)
- vec_mergeh((vector unsigned char)vzero, srcM1);
- srcM1B = (vector signed short)
- vec_mergel((vector unsigned char)vzero, srcM1);
- srcM2A = (vector signed short)
- vec_mergeh((vector unsigned char)vzero, srcM2);
- srcM2B = (vector signed short)
- vec_mergel((vector unsigned char)vzero, srcM2);
+ srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
+ srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
+ srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
+ srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
+
+ srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
+ srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
+ srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
+ srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
+
+ srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
+ srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
+ srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
+ srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
sum1A = vec_adds(srcP0A, srcP1A);
sum1B = vec_adds(srcP0B, srcP1B);
@@ -291,8 +320,8 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
pp1A = vec_mladd(sum1A, v20ss, v16ss);
pp1B = vec_mladd(sum1B, v20ss, v16ss);
- pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
- pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
+ pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
+ pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
pp3A = vec_add(sum3A, pp1A);
pp3B = vec_add(sum3B, pp1B);
@@ -305,18 +334,12 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
sum = vec_packsu(sumA, sumB);
- dst1 = vec_ld(0, dst);
- dst2 = vec_ld(16, dst);
- vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
+ ASSERT_ALIGNED(dst);
+ vdst = vec_ld(0, dst);
OP_U8_ALTIVEC(fsum, sum, vdst);
- rsum = vec_perm(fsum, fsum, dstperm);
- fdst1 = vec_sel(dst1, rsum, dstmask);
- fdst2 = vec_sel(rsum, dst2, dstmask);
-
- vec_st(fdst1, 0, dst);
- vec_st(fdst2, 16, dst);
+ vec_st(fsum, 0, dst);
src += srcStride;
dst += dstStride;
@@ -330,67 +353,53 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i
register int i;
- const vector signed int vzero = vec_splat_s32(0);
- const vector unsigned char perm = vec_lvsl(0, src);
- const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
- const vector unsigned short v5us = vec_splat_u16(5);
- const vector signed short v5ss = vec_splat_s16(5);
- const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
- const vector unsigned char dstperm = vec_lvsr(0, dst);
- const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
- const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
+ LOAD_ZERO;
+ const vec_u8_t perm = vec_lvsl(0, src);
+ const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
+ const vec_u16_t v5us = vec_splat_u16(5);
+ const vec_s16_t v5ss = vec_splat_s16(5);
+ const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
uint8_t *srcbis = src - (srcStride * 2);
- const vector unsigned char srcM2a = vec_ld(0, srcbis);
- const vector unsigned char srcM2b = vec_ld(16, srcbis);
- const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm);
+ const vec_u8_t srcM2a = vec_ld(0, srcbis);
+ const vec_u8_t srcM2b = vec_ld(16, srcbis);
+ const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm);
// srcbis += srcStride;
- const vector unsigned char srcM1a = vec_ld(0, srcbis += srcStride);
- const vector unsigned char srcM1b = vec_ld(16, srcbis);
- const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm);
+ const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride);
+ const vec_u8_t srcM1b = vec_ld(16, srcbis);
+ const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm);
// srcbis += srcStride;
- const vector unsigned char srcP0a = vec_ld(0, srcbis += srcStride);
- const vector unsigned char srcP0b = vec_ld(16, srcbis);
- const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm);
+ const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride);
+ const vec_u8_t srcP0b = vec_ld(16, srcbis);
+ const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm);
// srcbis += srcStride;
- const vector unsigned char srcP1a = vec_ld(0, srcbis += srcStride);
- const vector unsigned char srcP1b = vec_ld(16, srcbis);
- const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm);
+ const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride);
+ const vec_u8_t srcP1b = vec_ld(16, srcbis);
+ const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm);
// srcbis += srcStride;
- const vector unsigned char srcP2a = vec_ld(0, srcbis += srcStride);
- const vector unsigned char srcP2b = vec_ld(16, srcbis);
- const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm);
+ const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride);
+ const vec_u8_t srcP2b = vec_ld(16, srcbis);
+ const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm);
// srcbis += srcStride;
- vector signed short srcM2ssA = (vector signed short)
- vec_mergeh((vector unsigned char)vzero, srcM2);
- vector signed short srcM2ssB = (vector signed short)
- vec_mergel((vector unsigned char)vzero, srcM2);
- vector signed short srcM1ssA = (vector signed short)
- vec_mergeh((vector unsigned char)vzero, srcM1);
- vector signed short srcM1ssB = (vector signed short)
- vec_mergel((vector unsigned char)vzero, srcM1);
- vector signed short srcP0ssA = (vector signed short)
- vec_mergeh((vector unsigned char)vzero, srcP0);
- vector signed short srcP0ssB = (vector signed short)
- vec_mergel((vector unsigned char)vzero, srcP0);
- vector signed short srcP1ssA = (vector signed short)
- vec_mergeh((vector unsigned char)vzero, srcP1);
- vector signed short srcP1ssB = (vector signed short)
- vec_mergel((vector unsigned char)vzero, srcP1);
- vector signed short srcP2ssA = (vector signed short)
- vec_mergeh((vector unsigned char)vzero, srcP2);
- vector signed short srcP2ssB = (vector signed short)
- vec_mergel((vector unsigned char)vzero, srcP2);
-
- vector signed short pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
+ vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
+ vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
+ vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
+ vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
+ vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
+ vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
+ vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
+ vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
+ vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
+ vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
+
+ vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
psumA, psumB, sumA, sumB,
srcP3ssA, srcP3ssB,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
- vector unsigned char sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2,
- srcP3a, srcP3b, srcP3;
+ vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3;
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
@@ -398,10 +407,8 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3a = vec_ld(0, srcbis += srcStride);
srcP3b = vec_ld(16, srcbis);
srcP3 = vec_perm(srcP3a, srcP3b, perm);
- srcP3ssA = (vector signed short)
- vec_mergeh((vector unsigned char)vzero, srcP3);
- srcP3ssB = (vector signed short)
- vec_mergel((vector unsigned char)vzero, srcP3);
+ srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
+ srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
// srcbis += srcStride;
sum1A = vec_adds(srcP0ssA, srcP1ssA);
@@ -425,8 +432,8 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i
pp1A = vec_mladd(sum1A, v20ss, v16ss);
pp1B = vec_mladd(sum1B, v20ss, v16ss);
- pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
- pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
+ pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
+ pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
pp3A = vec_add(sum3A, pp1A);
pp3B = vec_add(sum3B, pp1B);
@@ -439,18 +446,12 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i
sum = vec_packsu(sumA, sumB);
- dst1 = vec_ld(0, dst);
- dst2 = vec_ld(16, dst);
- vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
+ ASSERT_ALIGNED(dst);
+ vdst = vec_ld(0, dst);
OP_U8_ALTIVEC(fsum, sum, vdst);
- rsum = vec_perm(fsum, fsum, dstperm);
- fdst1 = vec_sel(dst1, rsum, dstmask);
- fdst2 = vec_sel(rsum, dst2, dstmask);
-
- vec_st(fdst1, 0, dst);
- vec_st(fdst2, 16, dst);
+ vec_st(fsum, 0, dst);
dst += dstStride;
}
@@ -461,58 +462,50 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
register int i;
- const vector signed int vzero = vec_splat_s32(0);
- const vector unsigned char permM2 = vec_lvsl(-2, src);
- const vector unsigned char permM1 = vec_lvsl(-1, src);
- const vector unsigned char permP0 = vec_lvsl(+0, src);
- const vector unsigned char permP1 = vec_lvsl(+1, src);
- const vector unsigned char permP2 = vec_lvsl(+2, src);
- const vector unsigned char permP3 = vec_lvsl(+3, src);
- const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
- const vector unsigned int v10ui = vec_splat_u32(10);
- const vector signed short v5ss = vec_splat_s16(5);
- const vector signed short v1ss = vec_splat_s16(1);
- const vector signed int v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
- const vector unsigned int v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
+ LOAD_ZERO;
+ const vec_u8_t permM2 = vec_lvsl(-2, src);
+ const vec_u8_t permM1 = vec_lvsl(-1, src);
+ const vec_u8_t permP0 = vec_lvsl(+0, src);
+ const vec_u8_t permP1 = vec_lvsl(+1, src);
+ const vec_u8_t permP2 = vec_lvsl(+2, src);
+ const vec_u8_t permP3 = vec_lvsl(+3, src);
+ const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
+ const vec_u32_t v10ui = vec_splat_u32(10);
+ const vec_s16_t v5ss = vec_splat_s16(5);
+ const vec_s16_t v1ss = vec_splat_s16(1);
+ const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
+ const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
register int align = ((((unsigned long)src) - 2) % 16);
- const vector unsigned char neg1 = (const vector unsigned char)
- vec_splat_s8(-1);
-
- vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
+ vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
srcP2A, srcP2B, srcP3A, srcP3B,
srcM1A, srcM1B, srcM2A, srcM2B,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
pp1A, pp1B, pp2A, pp2B, psumA, psumB;
- const vector unsigned char dstperm = vec_lvsr(0, dst);
-
- const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
-
- const vector unsigned char mperm = (const vector unsigned char)
+ const vec_u8_t mperm = (const vec_u8_t)
AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
int16_t *tmpbis = tmp;
- vector signed short tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
+ vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
tmpP2ssA, tmpP2ssB;
- vector signed int pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
+ vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
ssumAe, ssumAo, ssumBe, ssumBo;
- vector unsigned char fsum, sumv, sum, dst1, dst2, vdst,
- rsum, fdst1, fdst2;
- vector signed short ssume, ssumo;
+ vec_u8_t fsum, sumv, sum, vdst;
+ vec_s16_t ssume, ssumo;
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
src -= (2 * srcStride);
for (i = 0 ; i < 21 ; i ++) {
- vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
- vector unsigned char srcR1 = vec_ld(-2, src);
- vector unsigned char srcR2 = vec_ld(14, src);
+ vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
+ vec_u8_t srcR1 = vec_ld(-2, src);
+ vec_u8_t srcR2 = vec_ld(14, src);
switch (align) {
default: {
@@ -532,7 +525,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
srcP3 = srcR2;
} break;
case 12: {
- vector unsigned char srcR3 = vec_ld(30, src);
+ vec_u8_t srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0);
@@ -541,7 +534,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
case 13: {
- vector unsigned char srcR3 = vec_ld(30, src);
+ vec_u8_t srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0);
@@ -550,7 +543,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
case 14: {
- vector unsigned char srcR3 = vec_ld(30, src);
+ vec_u8_t srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = srcR2;
@@ -559,7 +552,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
case 15: {
- vector unsigned char srcR3 = vec_ld(30, src);
+ vec_u8_t srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = srcR2;
srcP0 = vec_perm(srcR2, srcR3, permP0);
@@ -569,32 +562,20 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
} break;
}
- srcP0A = (vector signed short)
- vec_mergeh((vector unsigned char)vzero, srcP0);
- srcP0B = (vector signed short)
- vec_mergel((vector unsigned char)vzero, srcP0);
- srcP1A = (vector signed short)
- vec_mergeh((vector unsigned char)vzero, srcP1);
- srcP1B = (vector signed short)
- vec_mergel((vector unsigned char)vzero, srcP1);
-
- srcP2A = (vector signed short)
- vec_mergeh((vector unsigned char)vzero, srcP2);
- srcP2B = (vector signed short)
- vec_mergel((vector unsigned char)vzero, srcP2);
- srcP3A = (vector signed short)
- vec_mergeh((vector unsigned char)vzero, srcP3);
- srcP3B = (vector signed short)
- vec_mergel((vector unsigned char)vzero, srcP3);
-
- srcM1A = (vector signed short)
- vec_mergeh((vector unsigned char)vzero, srcM1);
- srcM1B = (vector signed short)
- vec_mergel((vector unsigned char)vzero, srcM1);
- srcM2A = (vector signed short)
- vec_mergeh((vector unsigned char)vzero, srcM2);
- srcM2B = (vector signed short)
- vec_mergel((vector unsigned char)vzero, srcM2);
+ srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
+ srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
+ srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
+ srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
+
+ srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
+ srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
+ srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
+ srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
+
+ srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
+ srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
+ srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
+ srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
sum1A = vec_adds(srcP0A, srcP1A);
sum1B = vec_adds(srcP0B, srcP1B);
@@ -606,8 +587,8 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
pp1A = vec_mladd(sum1A, v20ss, sum3A);
pp1B = vec_mladd(sum1B, v20ss, sum3B);
- pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
- pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
+ pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
+ pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
psumA = vec_sub(pp1A, pp2A);
psumB = vec_sub(pp1B, pp2B);
@@ -636,15 +617,15 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
tmpbis += tmpStride;
for (i = 0 ; i < 16 ; i++) {
- const vector signed short tmpP3ssA = vec_ld(0, tmpbis);
- const vector signed short tmpP3ssB = vec_ld(16, tmpbis);
+ const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis);
+ const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis);
- const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
- const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
- const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
- const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
- const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
- const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
+ const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
+ const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
+ const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
+ const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
+ const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
+ const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
tmpbis += tmpStride;
@@ -669,9 +650,9 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
pp2Be = vec_mule(sum2B, v5ss);
pp2Bo = vec_mulo(sum2B, v5ss);
- pp3Ae = vec_sra((vector signed int)sum3A, v16ui);
+ pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui);
pp3Ao = vec_mulo(sum3A, v1ss);
- pp3Be = vec_sra((vector signed int)sum3B, v16ui);
+ pp3Be = vec_sra((vec_s32_t)sum3B, v16ui);
pp3Bo = vec_mulo(sum3B, v1ss);
pp1cAe = vec_add(pp1Ae, v512si);
@@ -700,18 +681,12 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
sumv = vec_packsu(ssume, ssumo);
sum = vec_perm(sumv, sumv, mperm);
- dst1 = vec_ld(0, dst);
- dst2 = vec_ld(16, dst);
- vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
+ ASSERT_ALIGNED(dst);
+ vdst = vec_ld(0, dst);
OP_U8_ALTIVEC(fsum, sum, vdst);
- rsum = vec_perm(fsum, fsum, dstperm);
- fdst1 = vec_sel(dst1, rsum, dstmask);
- fdst2 = vec_sel(rsum, dst2, dstmask);
-
- vec_st(fdst1, 0, dst);
- vec_st(fdst2, 16, dst);
+ vec_st(fsum, 0, dst);
dst += dstStride;
}
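
All three lowpass kernels in this template implement H.264's six-tap (1, -5, 20, 20, -5, 1) filter: the sums built above pair the taps symmetrically (sum1 = P0+P1, sum2 = M1+P2, sum3 = M2+P3), so each output is 20*sum1 - 5*sum2 + sum3, rounded with 16 and shifted by 5. The hv variant instead stores the unrounded intermediate into tmp and rounds once in the vertical pass with +512 (v512si) and a shift by 10. A scalar view of one horizontal output pixel (a sketch, with the clamp written out):

    static inline int lowpass6(const uint8_t *src)
    {
        /* src points at the left of the two nearest full pels. */
        int sum = 20 * (src[0]  + src[1])   /* sum1 */
                -  5 * (src[-1] + src[2])   /* sum2 */
                +      (src[-2] + src[3]);  /* sum3 */
        sum = (sum + 16) >> 5;
        return sum < 0 ? 0 : (sum > 255 ? 255 : sum);
    }
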
diff --git a/contrib/ffmpeg/libavcodec/ppc/idct_altivec.c b/contrib/ffmpeg/libavcodec/ppc/idct_altivec.c
index 66c8082f7..37b2f62c3 100644
--- a/contrib/ffmpeg/libavcodec/ppc/idct_altivec.c
+++ b/contrib/ffmpeg/libavcodec/ppc/idct_altivec.c
@@ -16,7 +16,6 @@
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- *
*/
/*
@@ -39,14 +38,14 @@
#include <stdlib.h> /* malloc(), free() */
#include <string.h>
-#include "../dsputil.h"
+#include "dsputil.h"
#include "gcc_fixes.h"
-#include "dsputil_altivec.h"
+#include "dsputil_ppc.h"
#define vector_s16_t vector signed short
-#define const_vector_s16_t const_vector signed short
+#define const_vector_s16_t const vector signed short
#define vector_u16_t vector unsigned short
#define vector_s8_t vector signed char
#define vector_u8_t vector unsigned char
diff --git a/contrib/ffmpeg/libavcodec/ppc/imgresample_altivec.c b/contrib/ffmpeg/libavcodec/ppc/imgresample_altivec.c
new file mode 100644
index 000000000..3b161c5a6
--- /dev/null
+++ b/contrib/ffmpeg/libavcodec/ppc/imgresample_altivec.c
@@ -0,0 +1,153 @@
+/*
+ * High quality image resampling with polyphase filters
+ * Copyright (c) 2001 Fabrice Bellard.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file imgresample_altivec.c
+ * High quality image resampling with polyphase filters - AltiVec bits
+ */
+
+#include "gcc_fixes.h"
+
+typedef union {
+ vector unsigned char v;
+ unsigned char c[16];
+} vec_uc_t;
+
+typedef union {
+ vector signed short v;
+ signed short s[8];
+} vec_ss_t;
+
+void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
+ int wrap, int16_t *filter)
+{
+ int sum, i;
+ const uint8_t *s;
+ vector unsigned char *tv, tmp, dstv, zero;
+ vec_ss_t srchv[4], srclv[4], fv[4];
+ vector signed short zeros, sumhv, sumlv;
+ s = src;
+
+ for(i=0;i<4;i++)
+ {
+ /*
+ The vec_madds later on does an implicit >>15 on the result.
+ Since FILTER_BITS is 8, and we have 15 bits of magnitude in
+ a signed short, we have just enough bits to pre-shift our
+ filter constants <<7 to compensate for vec_madds.
+ */
+ fv[i].s[0] = filter[i] << (15-FILTER_BITS);
+ fv[i].v = vec_splat(fv[i].v, 0);
+ }
+
+ zero = vec_splat_u8(0);
+ zeros = vec_splat_s16(0);
+
+
+ /*
+ When we're resampling, we'd ideally like both our input buffers,
+ and output buffers to be 16-byte aligned, so we can do both aligned
+ reads and writes. Sadly we can't always have this at the moment, so
+ we opt for aligned writes, as unaligned writes have a huge overhead.
+ To do this, do enough scalar resamples to get dst 16-byte aligned.
+ */
+ i = (-(int)dst) & 0xf;
+ while(i>0) {
+ sum = s[0 * wrap] * filter[0] +
+ s[1 * wrap] * filter[1] +
+ s[2 * wrap] * filter[2] +
+ s[3 * wrap] * filter[3];
+ sum = sum >> FILTER_BITS;
+ if (sum<0) sum = 0; else if (sum>255) sum=255;
+ dst[0] = sum;
+ dst++;
+ s++;
+ dst_width--;
+ i--;
+ }
+
+ /* Do our altivec resampling on 16 pixels at once. */
+ while(dst_width>=16) {
+ /*
+ Read 16 (potentially unaligned) bytes from each of
+ 4 lines into 4 vectors, and split them into shorts.
+ Interleave the multiply/accumulate for the resample
+ filter with the loads to hide the 3 cycle latency
+ the vec_madds have.
+ */
+ tv = (vector unsigned char *) &s[0 * wrap];
+ tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[0 * wrap]));
+ srchv[0].v = (vector signed short) vec_mergeh(zero, tmp);
+ srclv[0].v = (vector signed short) vec_mergel(zero, tmp);
+ sumhv = vec_madds(srchv[0].v, fv[0].v, zeros);
+ sumlv = vec_madds(srclv[0].v, fv[0].v, zeros);
+
+ tv = (vector unsigned char *) &s[1 * wrap];
+ tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap]));
+ srchv[1].v = (vector signed short) vec_mergeh(zero, tmp);
+ srclv[1].v = (vector signed short) vec_mergel(zero, tmp);
+ sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv);
+ sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv);
+
+ tv = (vector unsigned char *) &s[2 * wrap];
+ tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap]));
+ srchv[2].v = (vector signed short) vec_mergeh(zero, tmp);
+ srclv[2].v = (vector signed short) vec_mergel(zero, tmp);
+ sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv);
+ sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv);
+
+ tv = (vector unsigned char *) &s[3 * wrap];
+ tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap]));
+ srchv[3].v = (vector signed short) vec_mergeh(zero, tmp);
+ srclv[3].v = (vector signed short) vec_mergel(zero, tmp);
+ sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv);
+ sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv);
+
+ /*
+ Pack the results into our destination vector,
+ and do an aligned write of that back to memory.
+ */
+ dstv = vec_packsu(sumhv, sumlv);
+ vec_st(dstv, 0, (vector unsigned char *) dst);
+
+ dst+=16;
+ s+=16;
+ dst_width-=16;
+ }
+
+ /*
+ If there are any leftover pixels, resample them
+ with the slow scalar method.
+ */
+ while(dst_width>0) {
+ sum = s[0 * wrap] * filter[0] +
+ s[1 * wrap] * filter[1] +
+ s[2 * wrap] * filter[2] +
+ s[3 * wrap] * filter[3];
+ sum = sum >> FILTER_BITS;
+ if (sum<0) sum = 0; else if (sum>255) sum=255;
+ dst[0] = sum;
+ dst++;
+ s++;
+ dst_width--;
+ }
+}
+
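
The coefficient pre-shift in the setup loop above is worth a worked example: vec_madds performs an implicit arithmetic right shift by 15 on each product, so a filter coefficient c stored as c << (15 - FILTER_BITS) = c << 7 yields ((c << 7) * s) >> 15 = (c * s) >> 8 per tap, matching the scalar sum >> FILTER_BITS path. For c = 64 (0.25 in 8-bit fixed point) and a sample s = 200: the scalar path gives (64 * 200) >> 8 = 50, and the vector path gives (8192 * 200) >> 15 = 1638400 >> 15 = 50. Note that the vector path shifts per tap while the scalar path shifts the final sum once, so the two can differ in the low bit for some inputs.
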
diff --git a/contrib/ffmpeg/libavcodec/ppc/imgresample_altivec.h b/contrib/ffmpeg/libavcodec/ppc/imgresample_altivec.h
new file mode 100644
index 000000000..538c1bee6
--- /dev/null
+++ b/contrib/ffmpeg/libavcodec/ppc/imgresample_altivec.h
@@ -0,0 +1,26 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef FFMPEG_IMGRESAMPLE_ALTIVEC_H
+#define FFMPEG_IMGRESAMPLE_ALTIVEC_H
+
+#include <stdint.h>
+
+void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
+ int wrap, int16_t *filter);
+#endif /* FFMPEG_IMGRESAMPLE_ALTIVEC_H */
diff --git a/contrib/ffmpeg/libavcodec/ppc/int_altivec.c b/contrib/ffmpeg/libavcodec/ppc/int_altivec.c
new file mode 100644
index 000000000..95497c99a
--- /dev/null
+++ b/contrib/ffmpeg/libavcodec/ppc/int_altivec.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ ** @file int_altivec.c
+ ** integer misc ops.
+ **/
+
+#include "dsputil.h"
+
+#include "gcc_fixes.h"
+
+#include "dsputil_altivec.h"
+
+static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
+ int size) {
+ int i, size16;
+ vector signed char vpix1;
+ vector signed short vpix2, vdiff, vpix1l,vpix1h;
+ union { vector signed int vscore;
+ int32_t score[4];
+ } u;
+ u.vscore = vec_splat_s32(0);
+//
+//XXX lazy way, fix it later
+
+#define vec_unaligned_load(b) \
+ vec_perm(vec_ld(0,b),vec_ld(15,b),vec_lvsl(0, b));
+
+ size16 = size >> 4;
+ while(size16) {
+// score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
+ //load pix1 and the first batch of pix2
+
+ vpix1 = vec_unaligned_load(pix1);
+ vpix2 = vec_unaligned_load(pix2);
+ pix2 += 8;
+ //unpack
+ vpix1h = vec_unpackh(vpix1);
+ vdiff = vec_sub(vpix1h, vpix2);
+ vpix1l = vec_unpackl(vpix1);
+ // load another batch from pix2
+ vpix2 = vec_unaligned_load(pix2);
+ u.vscore = vec_msum(vdiff, vdiff, u.vscore);
+ vdiff = vec_sub(vpix1l, vpix2);
+ u.vscore = vec_msum(vdiff, vdiff, u.vscore);
+ pix1 += 16;
+ pix2 += 8;
+ size16--;
+ }
+ u.vscore = vec_sums(u.vscore, vec_splat_s32(0));
+
+ size %= 16;
+ for (i = 0; i < size; i++) {
+ u.score[3] += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
+ }
+ return u.score[3];
+}
+
+void int_init_altivec(DSPContext* c, AVCodecContext *avctx)
+{
+ c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec;
+}
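
For reference, the vector loop above is an unrolled form of the commented scalar expression: vec_msum accumulates eight 16-bit squared differences into four 32-bit partial sums per call, and vec_sums folds them before the scalar tail handles the size % 16 leftovers. The scalar equivalent:

    static int ssd_int8_vs_int16_ref(const int8_t *pix1, const int16_t *pix2,
                                     int size)
    {
        int i, score = 0;
        for (i = 0; i < size; i++)
            score += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
        return score;
    }
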
diff --git a/contrib/ffmpeg/libavcodec/ppc/mathops.h b/contrib/ffmpeg/libavcodec/ppc/mathops.h
index 6af23f246..d7cc85365 100644
--- a/contrib/ffmpeg/libavcodec/ppc/mathops.h
+++ b/contrib/ffmpeg/libavcodec/ppc/mathops.h
@@ -20,6 +20,9 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#ifndef FFMPEG_PPC_MATHOPS_H
+#define FFMPEG_PPC_MATHOPS_H
+
#if defined(ARCH_POWERPC_405)
/* signed 16x16 -> 32 multiply add accumulate */
# define MAC16(rt, ra, rb) \
@@ -31,3 +34,5 @@
asm ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb));
__rt; })
#endif
+
+#endif /* FFMPEG_PPC_MATHOPS_H */
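
On the PowerPC 405 these macros map to the maclhw/mullhw halfword multiply-accumulate instructions; on other targets libavcodec falls back to plain C definitions, roughly as sketched below (operands assumed to fit in 16 bits, as on the asm path):

    /* Sketch of the generic fallbacks these asm macros specialize. */
    #define MUL16(ra, rb)     ((ra) * (rb))
    #define MAC16(rt, ra, rb) ((rt) += MUL16(ra, rb))
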
diff --git a/contrib/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c b/contrib/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c
index 3822cb20e..a2ba5e125 100644
--- a/contrib/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c
+++ b/contrib/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c
@@ -23,13 +23,13 @@
#include <stdlib.h>
#include <stdio.h>
-#include "../dsputil.h"
-#include "../mpegvideo.h"
+#include "dsputil.h"
+#include "mpegvideo.h"
#include "gcc_fixes.h"
-#include "dsputil_altivec.h"
-
+#include "dsputil_ppc.h"
+#include "util_altivec.h"
// Swaps two variables (used for altivec registers)
#define SWAP(a,b) \
do { \
@@ -66,12 +66,8 @@ do { \
}
-#ifdef CONFIG_DARWIN
-#define FOUROF(a) (a)
-#else
-// slower, for dumb non-apple GCC
-#define FOUROF(a) {a,a,a,a}
-#endif
+#define FOUROF(a) AVV(a,a,a,a)
+
int dct_quantize_altivec(MpegEncContext* s,
DCTELEM* data, int n,
int qscale, int* overflow)
@@ -79,8 +75,8 @@ int dct_quantize_altivec(MpegEncContext* s,
int lastNonZero;
vector float row0, row1, row2, row3, row4, row5, row6, row7;
vector float alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7;
- const_vector float zero = (const_vector float)FOUROF(0.);
- // used after quantise step
+ const vector float zero = (const vector float)FOUROF(0.);
+ // used after quantize step
int oldBaseValue = 0;
// Load the data into the row/alt vectors
@@ -258,7 +254,7 @@ int dct_quantize_altivec(MpegEncContext* s,
}
}
- // perform the quantise step, using the floating point data
+ // perform the quantize step, using the floating point data
// still in the row/alt registers
{
const int* biasAddr;
@@ -474,7 +470,7 @@ int dct_quantize_altivec(MpegEncContext* s,
data[0] = (oldBaseValue + 4) >> 3;
}
- // We handled the tranpose permutation above and we don't
+ // We handled the transpose permutation above and we don't
// need to permute the "no" permutation case.
if ((lastNonZero > 0) &&
(s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) &&
@@ -486,7 +482,6 @@ int dct_quantize_altivec(MpegEncContext* s,
return lastNonZero;
}
-#undef FOUROF
/*
AltiVec version of dct_unquantize_h263
@@ -515,25 +510,25 @@ POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1);
}else
qadd = 0;
i = 1;
- nCoeffs= 63; //does not allways use zigzag table
+ nCoeffs= 63; //does not always use zigzag table
} else {
i = 0;
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
}
{
- register const_vector signed short vczero = (const_vector signed short)vec_splat_s16(0);
- short __attribute__ ((aligned(16))) qmul8[] =
+ register const vector signed short vczero = (const vector signed short)vec_splat_s16(0);
+ DECLARE_ALIGNED_16(short, qmul8[]) =
{
qmul, qmul, qmul, qmul,
qmul, qmul, qmul, qmul
};
- short __attribute__ ((aligned(16))) qadd8[] =
+ DECLARE_ALIGNED_16(short, qadd8[]) =
{
qadd, qadd, qadd, qadd,
qadd, qadd, qadd, qadd
};
- short __attribute__ ((aligned(16))) nqadd8[] =
+ DECLARE_ALIGNED_16(short, nqadd8[]) =
{
-qadd, -qadd, -qadd, -qadd,
-qadd, -qadd, -qadd, -qadd
@@ -601,3 +596,50 @@ POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1);
}
POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63);
}
+
+
+extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
+extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
+
+void MPV_common_init_altivec(MpegEncContext *s)
+{
+ if ((mm_flags & MM_ALTIVEC) == 0) return;
+
+ if (s->avctx->lowres==0)
+ {
+ if ((s->avctx->idct_algo == FF_IDCT_AUTO) ||
+ (s->avctx->idct_algo == FF_IDCT_ALTIVEC))
+ {
+ s->dsp.idct_put = idct_put_altivec;
+ s->dsp.idct_add = idct_add_altivec;
+ s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
+ }
+ }
+
+ // Test to make sure that the DCT's required alignments are met.
+ if ((((long)(s->q_intra_matrix) & 0x0f) != 0) ||
+ (((long)(s->q_inter_matrix) & 0x0f) != 0))
+ {
+ av_log(s->avctx, AV_LOG_INFO, "Internal Error: q-matrix blocks must be 16-byte aligned "
+ "to use AltiVec DCT. Reverting to non-AltiVec version.\n");
+ return;
+ }
+
+ if (((long)(s->intra_scantable.inverse) & 0x0f) != 0)
+ {
+ av_log(s->avctx, AV_LOG_INFO, "Internal Error: scan table blocks must be 16-byte aligned "
+ "to use AltiVec DCT. Reverting to non-AltiVec version.\n");
+ return;
+ }
+
+
+ if ((s->avctx->dct_algo == FF_DCT_AUTO) ||
+ (s->avctx->dct_algo == FF_DCT_ALTIVEC))
+ {
+#if 0 /* seems to cause trouble under some circumstances */
+ s->dct_quantize = dct_quantize_altivec;
+#endif
+ s->dct_unquantize_h263_intra = dct_unquantize_h263_altivec;
+ s->dct_unquantize_h263_inter = dct_unquantize_h263_altivec;
+ }
+}
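
DECLARE_ALIGNED_16 replaces the bare __attribute__((aligned(16))) used before, so the qmul8/qadd8 tables can be fetched with a single aligned vec_ld; the runtime checks above enforce the same 16-byte invariant on pointers the code does not allocate itself. A sketch of the idiom (the real macro in FFmpeg's headers dispatches on the compiler):

    #if defined(__GNUC__)
    #define DECLARE_ALIGNED_16(t, v) t v __attribute__((aligned(16)))
    #else
    #define DECLARE_ALIGNED_16(t, v) __declspec(align(16)) t v
    #endif

    /* Equivalent of the runtime test applied to q_intra_matrix above. */
    #define IS_ALIGNED_16(p) ((((unsigned long)(p)) & 0x0f) == 0)
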
diff --git a/contrib/ffmpeg/libavcodec/ppc/mpegvideo_ppc.c b/contrib/ffmpeg/libavcodec/ppc/mpegvideo_ppc.c
deleted file mode 100644
index c5e822f77..000000000
--- a/contrib/ffmpeg/libavcodec/ppc/mpegvideo_ppc.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2002 Dieter Shirley
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "../dsputil.h"
-#include "../mpegvideo.h"
-#include <time.h>
-
-#ifdef HAVE_ALTIVEC
-#include "dsputil_altivec.h"
-#endif
-
-extern int dct_quantize_altivec(MpegEncContext *s,
- DCTELEM *block, int n,
- int qscale, int *overflow);
-extern void dct_unquantize_h263_altivec(MpegEncContext *s,
- DCTELEM *block, int n, int qscale);
-
-extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
-extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
-
-
-void MPV_common_init_ppc(MpegEncContext *s)
-{
-#ifdef HAVE_ALTIVEC
- if (has_altivec())
- {
- if (s->avctx->lowres==0)
- {
- if ((s->avctx->idct_algo == FF_IDCT_AUTO) ||
- (s->avctx->idct_algo == FF_IDCT_ALTIVEC))
- {
- s->dsp.idct_put = idct_put_altivec;
- s->dsp.idct_add = idct_add_altivec;
- s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
- }
- }
-
- // Test to make sure that the dct required alignments are met.
- if ((((long)(s->q_intra_matrix) & 0x0f) != 0) ||
- (((long)(s->q_inter_matrix) & 0x0f) != 0))
- {
- av_log(s->avctx, AV_LOG_INFO, "Internal Error: q-matrix blocks must be 16-byte aligned "
- "to use Altivec DCT. Reverting to non-altivec version.\n");
- return;
- }
-
- if (((long)(s->intra_scantable.inverse) & 0x0f) != 0)
- {
- av_log(s->avctx, AV_LOG_INFO, "Internal Error: scan table blocks must be 16-byte aligned "
- "to use Altivec DCT. Reverting to non-altivec version.\n");
- return;
- }
-
-
- if ((s->avctx->dct_algo == FF_DCT_AUTO) ||
- (s->avctx->dct_algo == FF_DCT_ALTIVEC))
- {
-#if 0 /* seems to cause trouble under some circumstances */
- s->dct_quantize = dct_quantize_altivec;
-#endif
- s->dct_unquantize_h263_intra = dct_unquantize_h263_altivec;
- s->dct_unquantize_h263_inter = dct_unquantize_h263_altivec;
- }
- } else
-#endif
- {
- /* Non-AltiVec PPC optimisations here */
- }
-}
-
diff --git a/contrib/ffmpeg/libavcodec/ppc/snow_altivec.c b/contrib/ffmpeg/libavcodec/ppc/snow_altivec.c
index b15672ffe..8770f05f5 100644
--- a/contrib/ffmpeg/libavcodec/ppc/snow_altivec.c
+++ b/contrib/ffmpeg/libavcodec/ppc/snow_altivec.c
@@ -1,5 +1,5 @@
/*
- * Altivec optimized snow DSP utils
+ * AltiVec-optimized snow DSP utils
* Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
*
* This file is part of FFmpeg.
@@ -17,15 +17,13 @@
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- *
- *
*/
-#include "../dsputil.h"
+#include "dsputil.h"
#include "gcc_fixes.h"
#include "dsputil_altivec.h"
-#include "../snow.h"
+#include "snow.h"
#undef NDEBUG
#include <assert.h>
@@ -60,57 +58,56 @@ static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
//altivec code
-void ff_snow_horizontal_compose97i_altivec(DWTELEM *b, int width)
+void ff_snow_horizontal_compose97i_altivec(IDWTELEM *b, int width)
{
+#if 0
const int w2= (width+1)>>1;
- DECLARE_ALIGNED_16(DWTELEM, temp[(width>>1)]);
+ DECLARE_ALIGNED_16(IDWTELEM, temp[(width>>1)]);
const int w_l= (width>>1);
const int w_r= w2 - 1;
int i;
- vector signed int t1, t2, x, y, tmp1, tmp2;
- vector signed int *vbuf, *vtmp;
+ vector signed short t1, t2, x, y, tmp1, tmp2;
+ vector signed short *vbuf, *vtmp;
vector unsigned char align;
-
-
{ // Lift 0
- DWTELEM * const ref = b + w2 - 1;
- DWTELEM b_0 = b[0];
- vbuf = (vector signed int *)b;
+ IDWTELEM * const ref = b + w2 - 1;
+ IDWTELEM b_0 = b[0];
+ vector signed short v7 = vec_splat_s16(7);
+ vbuf = (vector signed short *)b;
tmp1 = vec_ld (0, ref);
align = vec_lvsl (0, ref);
tmp2 = vec_ld (15, ref);
- t1= vec_perm(tmp1, tmp2, align);
-
- i = 0;
+ t1 = vec_perm(tmp1, tmp2, align);
for (i=0; i<w_l-15; i+=16) {
#if 0
- b[i+0] = b[i+0] - ((3 * (ref[i+0] + ref[i+1]) + 4) >> 3);
+/* b[i+0] = b[i+0] - ((3 * (ref[i+0] + ref[i+1]) + 4) >> 3);
b[i+1] = b[i+1] - ((3 * (ref[i+1] + ref[i+2]) + 4) >> 3);
b[i+2] = b[i+2] - ((3 * (ref[i+2] + ref[i+3]) + 4) >> 3);
- b[i+3] = b[i+3] - ((3 * (ref[i+3] + ref[i+4]) + 4) >> 3);
+ b[i+3] = b[i+3] - ((3 * (ref[i+3] + ref[i+4]) + 4) >> 3);*/
+ b[i+0] = b[i+0] + ((7 * (ref[i+0] + ref[i+1])-1) >> 8);
#else
- tmp1 = vec_ld (0, ref+4+i);
- tmp2 = vec_ld (15, ref+4+i);
+ tmp1 = vec_ld (0, ref+8+i);
+ tmp2 = vec_ld (15, ref+8+i);
t2 = vec_perm(tmp1, tmp2, align);
- y = vec_add(t1,vec_sld(t1,t2,4));
- y = vec_add(vec_add(y,y),y);
+ y = vec_add(t1, vec_sld(t1,t2,2));
+// y = vec_add(vec_add(y,y),y);
- tmp1 = vec_ld (0, ref+8+i);
+ tmp1 = vec_ld (0, ref+12+i);
y = vec_add(y, vec_splat_s32(4));
y = vec_sra(y, vec_splat_u32(3));
- tmp2 = vec_ld (15, ref+8+i);
+ tmp2 = vec_ld (15, ref+12+i);
*vbuf = vec_sub(*vbuf, y);
- t1=t2;
+ t1 = t2;
vbuf++;
@@ -164,6 +161,7 @@ void ff_snow_horizontal_compose97i_altivec(DWTELEM *b, int width)
vbuf++;
#endif
+
}
snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
@@ -365,6 +363,7 @@ void ff_snow_horizontal_compose97i_altivec(DWTELEM *b, int width)
}
}
+#endif
}
void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width)
@@ -524,7 +523,7 @@ static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc,
vector signed int *v = (vector signed int *)vbuf, *d;
for(y=0; y<b_h; y++){
- //FIXME ugly missue of obmc_stride
+ //FIXME ugly misuse of obmc_stride
uint8_t *obmc1= obmc + y*obmc_stride;
uint8_t *obmc2= obmc1+ (obmc_stride>>1);
@@ -590,7 +589,7 @@ static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc,
vector signed int *v = (vector signed int *)vbuf, *d;
for(y=0; y<b_h; y++){
- //FIXME ugly missue of obmc_stride
+ //FIXME ugly misuse of obmc_stride
uint8_t *obmc1= obmc + y*obmc_stride;
uint8_t *obmc2= obmc1+ (obmc_stride>>1);
@@ -673,7 +672,7 @@ static void inner_add_yblock_a_bw_8_obmc_16_altivec(uint8_t *obmc,
vector signed int *v = (vector signed int *)vbuf, *d;
for(y=0; y<b_h; y++){
- //FIXME ugly missue of obmc_stride
+ //FIXME ugly misuse of obmc_stride
uint8_t *obmc1= obmc + y*obmc_stride;
uint8_t *obmc2= obmc1+ (obmc_stride>>1);
@@ -719,7 +718,7 @@ static void inner_add_yblock_a_bw_16_obmc_32_altivec(uint8_t *obmc,
vector signed int *v = (vector signed int *)vbuf, *d;
for(y=0; y<b_h; y++){
- //FIXME ugly missue of obmc_stride
+ //FIXME ugly misuse of obmc_stride
uint8_t *obmc1= obmc + y*obmc_stride;
uint8_t *obmc2= obmc1+ (obmc_stride>>1);
@@ -782,7 +781,9 @@ void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride,
void snow_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
+#if 0
c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec;
c->vertical_compose97i = ff_snow_vertical_compose97i_altivec;
c->inner_add_yblock = ff_snow_inner_add_yblock_altivec;
+#endif
}
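
The whole horizontal compose is stubbed out with #if 0 here because snow was migrating from 32-bit DWTELEM to 16-bit IDWTELEM buffers and the vector code was only partially converted. The "Lift 0" step it vectorized corresponds to this scalar loop (a sketch of the old 32-bit path, matching the commented reference lines in the diff):

    static void lift0_ref(int *b, const int *ref, int w_l)
    {
        int i;
        for (i = 0; i < w_l; i++)
            b[i] -= (3 * (ref[i] + ref[i + 1]) + 4) >> 3;
    }
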
diff --git a/contrib/ffmpeg/libavcodec/ppc/types_altivec.h b/contrib/ffmpeg/libavcodec/ppc/types_altivec.h
index f29026e04..6d41a928b 100644
--- a/contrib/ffmpeg/libavcodec/ppc/types_altivec.h
+++ b/contrib/ffmpeg/libavcodec/ppc/types_altivec.h
@@ -18,6 +18,9 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#ifndef FFMPEG_TYPES_ALTIVEC_H
+#define FFMPEG_TYPES_ALTIVEC_H
+
/***********************************************************************
* Vector types
**********************************************************************/
@@ -39,3 +42,5 @@
#define zero_s16v (vec_s16_t) zerov
#define zero_u32v (vec_u32_t) zerov
#define zero_s32v (vec_s32_t) zerov
+
+#endif /* FFMPEG_TYPES_ALTIVEC_H */
diff --git a/contrib/ffmpeg/libavcodec/ppc/util_altivec.h b/contrib/ffmpeg/libavcodec/ppc/util_altivec.h
new file mode 100644
index 000000000..6a8afb1b2
--- /dev/null
+++ b/contrib/ffmpeg/libavcodec/ppc/util_altivec.h
@@ -0,0 +1,105 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file util_altivec.h
+ * Contains misc utility macros and inline functions
+ */
+
+#ifndef FFMPEG_UTIL_ALTIVEC_H
+#define FFMPEG_UTIL_ALTIVEC_H
+
+#include <stdint.h>
+
+#include "config.h"
+
+#ifdef HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
+
+// used to build register permutation vectors (vcprm)
+// the 's' are for words in the _s_econd vector
+#define WORD_0 0x00,0x01,0x02,0x03
+#define WORD_1 0x04,0x05,0x06,0x07
+#define WORD_2 0x08,0x09,0x0a,0x0b
+#define WORD_3 0x0c,0x0d,0x0e,0x0f
+#define WORD_s0 0x10,0x11,0x12,0x13
+#define WORD_s1 0x14,0x15,0x16,0x17
+#define WORD_s2 0x18,0x19,0x1a,0x1b
+#define WORD_s3 0x1c,0x1d,0x1e,0x1f
+
+#define vcprm(a,b,c,d) (const vector unsigned char)AVV(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d)
+#define vcii(a,b,c,d) (const vector float)AVV(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d)
+
+// vcprmle is used to keep the same index as in the SSE version.
+// it is the same as vcprm, with the indices reversed
+// ('le' is Little Endian)
+#define vcprmle(a,b,c,d) vcprm(d,c,b,a)
+
+// used to build inverse/identity vectors (vcii)
+// n is _n_egative, p is _p_ositive
+#define FLOAT_n -1.
+#define FLOAT_p 1.
+
+
+// Transpose 8x8 matrix of 16-bit elements (in-place)
+#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
+do { \
+ vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \
+ vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \
+ \
+ A1 = vec_mergeh (a, e); \
+ B1 = vec_mergel (a, e); \
+ C1 = vec_mergeh (b, f); \
+ D1 = vec_mergel (b, f); \
+ E1 = vec_mergeh (c, g); \
+ F1 = vec_mergel (c, g); \
+ G1 = vec_mergeh (d, h); \
+ H1 = vec_mergel (d, h); \
+ \
+ A2 = vec_mergeh (A1, E1); \
+ B2 = vec_mergel (A1, E1); \
+ C2 = vec_mergeh (B1, F1); \
+ D2 = vec_mergel (B1, F1); \
+ E2 = vec_mergeh (C1, G1); \
+ F2 = vec_mergel (C1, G1); \
+ G2 = vec_mergeh (D1, H1); \
+ H2 = vec_mergel (D1, H1); \
+ \
+ a = vec_mergeh (A2, E2); \
+ b = vec_mergel (A2, E2); \
+ c = vec_mergeh (B2, F2); \
+ d = vec_mergel (B2, F2); \
+ e = vec_mergeh (C2, G2); \
+ f = vec_mergel (C2, G2); \
+ g = vec_mergeh (D2, H2); \
+ h = vec_mergel (D2, H2); \
+} while (0)
+
+
+/** \brief loads the unaligned vector from \a src at byte offset \a offset
+    and returns it */
+static inline vector unsigned char unaligned_load(int offset, uint8_t *src)
+{
+ register vector unsigned char first = vec_ld(offset, src);
+ register vector unsigned char second = vec_ld(offset+15, src);
+ register vector unsigned char mask = vec_lvsl(offset, src);
+ return vec_perm(first, second, mask);
+}
+
+#endif /* FFMPEG_UTIL_ALTIVEC_H */
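
A short usage sketch for the new helpers (function names here are illustrative): vcprm() selects the four 32-bit words of a vec_perm() result, plain digits picking from the first operand and s-prefixed digits from the second; unaligned_load() hides the classic two-loads-plus-vec_lvsl() idiom for pointers that may not be 16-byte aligned. Note that vcprm() and vcii() expand through AVV(), so gcc_fixes.h has to be included first, as the AltiVec sources in this tree do:

    /* { a[0], a[1], b[0], b[1] }: words 0,1 of a, then words 0,1 of b */
    static vector float merge_low_halves(vector float a, vector float b)
    {
        return vec_perm(a, b, vcprm(0, 1, s0, s1));
    }

    /* negate the two high lanes: vcii(p,p,n,n) builds { 1, 1, -1, -1 } */
    static vector float negate_high(vector float v)
    {
        return vec_madd(v, vcii(p, p, n, n), (vector float)vec_splat_u32(0));
    }

    /* fetch 16 bytes from a possibly unaligned source pointer */
    static vector unsigned char load16(uint8_t *src)
    {
        return unaligned_load(0, src);
    }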
diff --git a/contrib/ffmpeg/libavcodec/ppc/vc1dsp_altivec.c b/contrib/ffmpeg/libavcodec/ppc/vc1dsp_altivec.c
index 114c9d41f..87bef808e 100644
--- a/contrib/ffmpeg/libavcodec/ppc/vc1dsp_altivec.c
+++ b/contrib/ffmpeg/libavcodec/ppc/vc1dsp_altivec.c
@@ -17,14 +17,13 @@
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- *
*/
-#include "../dsputil.h"
+#include "dsputil.h"
#include "gcc_fixes.h"
-#include "dsputil_altivec.h"
+#include "util_altivec.h"
// main steps of 8x8 transform
#define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \
@@ -139,7 +138,6 @@ static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
const vector unsigned int vec_7 = vec_splat_u32(7);
- const vector unsigned int vec_5 = vec_splat_u32(5);
const vector unsigned int vec_4 = vec_splat_u32(4);
const vector signed int vec_4s = vec_splat_s32(4);
const vector unsigned int vec_3 = vec_splat_u32(3);
@@ -229,7 +227,7 @@ static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
/** Do inverse transform on 8x4 part of block
*/
-static void vc1_inv_trans_8x4_altivec(DCTELEM block[64], int n)
+static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, DCTELEM *block)
{
vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
@@ -243,6 +241,9 @@ static void vc1_inv_trans_8x4_altivec(DCTELEM block[64], int n)
const vector unsigned int vec_3 = vec_splat_u32(3);
const vector unsigned int vec_2 = vec_splat_u32(2);
const vector unsigned int vec_1 = vec_splat_u32(1);
+ vector unsigned char tmp;
+ vector signed short tmp2, tmp3;
+ vector unsigned char perm0, perm1, p0, p1, p;
src0 = vec_ld( 0, block);
src1 = vec_ld( 16, block);
@@ -284,51 +285,42 @@ static void vc1_inv_trans_8x4_altivec(DCTELEM block[64], int n)
src7 = vec_pack(sF, s7);
TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
- if(!n){ // upper half of block
- s0 = vec_unpackh(src0);
- s1 = vec_unpackh(src1);
- s2 = vec_unpackh(src2);
- s3 = vec_unpackh(src3);
- s8 = vec_unpackl(src0);
- s9 = vec_unpackl(src1);
- sA = vec_unpackl(src2);
- sB = vec_unpackl(src3);
- STEP4(s0, s1, s2, s3, vec_64);
- SHIFT_VERT4(s0, s1, s2, s3);
- STEP4(s8, s9, sA, sB, vec_64);
- SHIFT_VERT4(s8, s9, sA, sB);
- src0 = vec_pack(s0, s8);
- src1 = vec_pack(s1, s9);
- src2 = vec_pack(s2, sA);
- src3 = vec_pack(s3, sB);
+ s0 = vec_unpackh(src0);
+ s1 = vec_unpackh(src1);
+ s2 = vec_unpackh(src2);
+ s3 = vec_unpackh(src3);
+ s8 = vec_unpackl(src0);
+ s9 = vec_unpackl(src1);
+ sA = vec_unpackl(src2);
+ sB = vec_unpackl(src3);
+ STEP4(s0, s1, s2, s3, vec_64);
+ SHIFT_VERT4(s0, s1, s2, s3);
+ STEP4(s8, s9, sA, sB, vec_64);
+ SHIFT_VERT4(s8, s9, sA, sB);
+ src0 = vec_pack(s0, s8);
+ src1 = vec_pack(s1, s9);
+ src2 = vec_pack(s2, sA);
+ src3 = vec_pack(s3, sB);
+
+ p0 = vec_lvsl (0, dest);
+ p1 = vec_lvsl (stride, dest);
+ p = vec_splat_u8 (-1);
+ perm0 = vec_mergeh (p, p0);
+ perm1 = vec_mergeh (p, p1);
- vec_st(src0, 0, block);
- vec_st(src1, 16, block);
- vec_st(src2, 32, block);
- vec_st(src3, 48, block);
- } else { //lower half of block
- s0 = vec_unpackh(src4);
- s1 = vec_unpackh(src5);
- s2 = vec_unpackh(src6);
- s3 = vec_unpackh(src7);
- s8 = vec_unpackl(src4);
- s9 = vec_unpackl(src5);
- sA = vec_unpackl(src6);
- sB = vec_unpackl(src7);
- STEP4(s0, s1, s2, s3, vec_64);
- SHIFT_VERT4(s0, s1, s2, s3);
- STEP4(s8, s9, sA, sB, vec_64);
- SHIFT_VERT4(s8, s9, sA, sB);
- src4 = vec_pack(s0, s8);
- src5 = vec_pack(s1, s9);
- src6 = vec_pack(s2, sA);
- src7 = vec_pack(s3, sB);
+#define ADD(dest,src,perm) \
+ /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \
+ tmp = vec_ld (0, dest); \
+ tmp2 = (vector signed short)vec_perm (tmp, vec_splat_u8(0), perm); \
+ tmp3 = vec_adds (tmp2, src); \
+ tmp = vec_packsu (tmp3, tmp3); \
+ vec_ste ((vector unsigned int)tmp, 0, (unsigned int *)dest); \
+ vec_ste ((vector unsigned int)tmp, 4, (unsigned int *)dest);
- vec_st(src4, 64, block);
- vec_st(src5, 80, block);
- vec_st(src6, 96, block);
- vec_st(src7,112, block);
- }
+ ADD (dest, src0, perm0) dest += stride;
+ ADD (dest, src1, perm1) dest += stride;
+ ADD (dest, src2, perm0) dest += stride;
+ ADD (dest, src3, perm1)
}
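
The rewritten vc1_inv_trans_8x4() drops the n parameter and the write-back to block[]: the vertical pass now adds its result straight into the destination picture. Per row, the ADD macro zero-extends eight destination pixels to 16 bits (the vec_mergeh() of vec_splat_u8(-1) with the vec_lvsl() result builds a permute whose 0xFF entries select bytes from the zero vector), adds the transform output with saturating vec_adds(), clamps back to bytes with vec_packsu(), and stores the low eight bytes with two 4-byte vec_ste() writes. A scalar sketch of the same per-row operation, for illustration only:

    /* Illustrative scalar equivalent of one ADD(dest, srcN, perm) row:
     * widen each pixel, add the coefficient, clamp to 0..255. */
    static void add_row_scalar(uint8_t *dest, const int16_t *src)
    {
        int i;
        for (i = 0; i < 8; i++) {
            int v = dest[i] + src[i];
            dest[i] = v < 0 ? 0 : (v > 255 ? 255 : v);
        }
    }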