/* * mmx_mlib.c * * Intel MMX implementation of motion comp routines. * MMX code written by David I. Lehn . * lib{mmx,xmmx,sse} can be found at http://shay.ecn.purdue.edu/~swar/ * * Copyright 2000, David I. Lehn . Released under GPL. * * You should have received a copy of the GNU General Public License * along with GNU Make; see the file COPYING. If not, write to * the Free Software Foundation, * */ #include #include #include #include "include/debug_print.h" #include "mmx.h" #include "mmx_mlib.h" static uint64_t ones = 0x0001000100010001ULL; static uint64_t twos = 0x0002000200020002ULL; //#define MC_MMX_verify void mlib_Init(void) { DNOTE("Using MMX accelerated media functions\n"); return; } /* static inline uint8_t clip_to_u8 (int16_t value) { //return value < 0 ? 0 : (value > 255 ? 255 : value); return ((uint16_t)value) > 256 ? value < 0 ? 0 : 255 : value; } */ static inline void mmx_average_2_U8(uint8_t *dst, const uint8_t *src1, const uint8_t *src2) { // // *dst = (*src1 + *src2 + 1)/2; // //pxor_r2r(mm0,mm0); // load 0 into mm0 movq_m2r(*src1,mm1); // load 8 src1 bytes movq_r2r(mm1,mm2); // copy 8 src1 bytes movq_m2r(*src2,mm3); // load 8 src2 bytes movq_r2r(mm3,mm4); // copy 8 src2 bytes punpcklbw_r2r(mm0,mm1); // unpack low src1 bytes punpckhbw_r2r(mm0,mm2); // unpack high src1 bytes punpcklbw_r2r(mm0,mm3); // unpack low src2 bytes punpckhbw_r2r(mm0,mm4); // unpack high src2 bytes paddw_m2r(ones, mm1); paddw_r2r(mm3,mm1); // add lows to mm1 psraw_i2r(1,mm1); // /2 paddw_m2r(ones, mm2); paddw_r2r(mm4,mm2); // add highs to mm2 psraw_i2r(1,mm2); // /2 packuswb_r2r(mm2,mm1); // pack (w/ saturation) movq_r2m(mm1,*dst); // store result in dst } static inline void mmx_interp_average_2_U8(uint8_t *dst, const uint8_t *src1, const uint8_t *src2) { // // *dst = (*dst + (*src1 + *src2 + 1)/2 + 1)/2; // //pxor_r2r(mm0,mm0); // load 0 into mm0 movq_m2r(*dst,mm1); // load 8 dst bytes movq_r2r(mm1,mm2); // copy 8 dst bytes movq_m2r(*src1,mm3); // load 8 src1 bytes movq_r2r(mm3,mm4); // copy 8 src1 bytes movq_m2r(*src2,mm5); // load 8 src2 bytes movq_r2r(mm5,mm6); // copy 8 src2 bytes punpcklbw_r2r(mm0,mm1); // unpack low dst bytes punpckhbw_r2r(mm0,mm2); // unpack high dst bytes punpcklbw_r2r(mm0,mm3); // unpack low src1 bytes punpckhbw_r2r(mm0,mm4); // unpack high src1 bytes punpcklbw_r2r(mm0,mm5); // unpack low src2 bytes punpckhbw_r2r(mm0,mm6); // unpack high src2 bytes paddw_m2r(ones, mm3); paddw_r2r(mm5,mm3); // add lows paddw_m2r(ones, mm4); paddw_r2r(mm6,mm4); // add highs psraw_i2r(1,mm3); // /2 psraw_i2r(1,mm4); // /2 paddw_m2r(ones, mm1); paddw_r2r(mm3,mm1); // add lows paddw_m2r(ones, mm2); paddw_r2r(mm4,mm2); // add highs psraw_i2r(1,mm1); // /2 psraw_i2r(1,mm2); // /2 packuswb_r2r(mm2,mm1); // pack (w/ saturation) movq_r2m(mm1,*dst); // store result in dst } static inline void mmx_average_4_U8(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4) { // // *dst = (*src1 + *src2 + *src3 + *src4 + 2)/4; // //pxor_r2r(mm0,mm0); // load 0 into mm0 movq_m2r(*src1,mm1); // load 8 src1 bytes movq_r2r(mm1,mm2); // copy 8 src1 bytes punpcklbw_r2r(mm0,mm1); // unpack low src1 bytes punpckhbw_r2r(mm0,mm2); // unpack high src1 bytes movq_m2r(*src2,mm3); // load 8 src2 bytes movq_r2r(mm3,mm4); // copy 8 src2 bytes punpcklbw_r2r(mm0,mm3); // unpack low src2 bytes punpckhbw_r2r(mm0,mm4); // unpack high src2 bytes paddw_r2r(mm3,mm1); // add lows paddw_r2r(mm4,mm2); // add highs // now have partials in mm1 and mm2 movq_m2r(*src3,mm3); // load 8 src3 bytes movq_r2r(mm3,mm4); // copy 8 src3 bytes punpcklbw_r2r(mm0,mm3); // unpack low src3 bytes punpckhbw_r2r(mm0,mm4); // unpack high src3 bytes paddw_r2r(mm3,mm1); // add lows paddw_r2r(mm4,mm2); // add highs movq_m2r(*src4,mm5); // load 8 src4 bytes movq_r2r(mm5,mm6); // copy 8 src4 bytes punpcklbw_r2r(mm0,mm5); // unpack low src4 bytes punpckhbw_r2r(mm0,mm6); // unpack high src4 bytes paddw_m2r(twos, mm1); paddw_r2r(mm5,mm1); // add lows paddw_m2r(twos, mm2); paddw_r2r(mm6,mm2); // add highs // now have subtotal in mm1 and mm2 psraw_i2r(2,mm1); // /4 psraw_i2r(2,mm2); // /4 packuswb_r2r(mm2,mm1); // pack (w/ saturation) movq_r2m(mm1,*dst); // store result in dst } static inline void mmx_interp_average_4_U8(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4) { // // *dst = (*dst + (*src1 + *src2 + *src3 + *src4 + 2)/4 + 1)/2; // //pxor_r2r(mm0,mm0); // load 0 into mm0 movq_m2r(*src1,mm1); // load 8 src1 bytes movq_r2r(mm1,mm2); // copy 8 src1 bytes punpcklbw_r2r(mm0,mm1); // unpack low src1 bytes punpckhbw_r2r(mm0,mm2); // unpack high src1 bytes movq_m2r(*src2,mm3); // load 8 src2 bytes movq_r2r(mm3,mm4); // copy 8 src2 bytes punpcklbw_r2r(mm0,mm3); // unpack low src2 bytes punpckhbw_r2r(mm0,mm4); // unpack high src2 bytes paddw_r2r(mm3,mm1); // add lows paddw_r2r(mm4,mm2); // add highs // now have partials in mm1 and mm2 movq_m2r(*src3,mm3); // load 8 src3 bytes movq_r2r(mm3,mm4); // copy 8 src3 bytes punpcklbw_r2r(mm0,mm3); // unpack low src3 bytes punpckhbw_r2r(mm0,mm4); // unpack high src3 bytes paddw_r2r(mm3,mm1); // add lows paddw_r2r(mm4,mm2); // add highs movq_m2r(*src4,mm5); // load 8 src4 bytes movq_r2r(mm5,mm6); // copy 8 src4 bytes punpcklbw_r2r(mm0,mm5); // unpack low src4 bytes punpckhbw_r2r(mm0,mm6); // unpack high src4 bytes paddw_m2r(twos, mm1); paddw_r2r(mm5,mm1); // add lows paddw_m2r(twos, mm2); paddw_r2r(mm6,mm2); // add highs psraw_i2r(2,mm1); // /4 psraw_i2r(2,mm2); // /4 // now have subtotal/4 in mm1 and mm2 movq_m2r(*dst,mm3); // load 8 dst bytes movq_r2r(mm3,mm4); // copy 8 dst bytes punpcklbw_r2r(mm0,mm3); // unpack low dst bytes punpckhbw_r2r(mm0,mm4); // unpack high dst bytes paddw_m2r(ones, mm1); paddw_r2r(mm3,mm1); // add lows paddw_m2r(ones, mm2); paddw_r2r(mm4,mm2); // add highs psraw_i2r(1,mm1); // /2 psraw_i2r(1,mm2); // /2 // now have end value in mm1 and mm2 packuswb_r2r(mm2,mm1); // pack (w/ saturation) movq_r2m(mm1,*dst); // store result in dst } // VideoCopyRef* - Copy block from reference block to current block // --------------------------------------------------------------- static inline void mlib_VideoCopyRefAve_U8_U8_MxN( const uint8_t m, const uint8_t n, uint8_t *curr_block, const uint8_t *ref_block, int32_t stride) { #define MMX_mmx_VideoCopyRefAve_U8_U8_MxN #if !defined(HAVE_MMX) || !defined(MMX_mmx_VideoCopyRefAve_U8_U8_MxN) int x,y; const int jump = stride - m; for (y = 0; y < n; y++) { for (x = 0; x < m; x++) *curr_block++ = (*curr_block + *ref_block++ + 1)/2; ref_block += jump; curr_block += jump; } #else int x,y; const int step = 8; const int jump = stride - m; pxor_r2r(mm0,mm0); // load 0 into mm0 for (y = 0; y < n; y++) { for (x = 0; x < m/8; x++) { mmx_average_2_U8(curr_block, curr_block, ref_block); curr_block += step; ref_block += step; } curr_block += jump; ref_block += jump; } #endif } void mlib_VideoCopyRefAve_U8_U8_16x16( uint8_t *curr_block, const uint8_t *ref_block, int32_t stride) { mlib_VideoCopyRefAve_U8_U8_MxN( 16, 16, curr_block, ref_block, stride); } void mlib_VideoCopyRefAve_U8_U8_16x8( uint8_t *curr_block, const uint8_t *ref_block, int32_t stride) { mlib_VideoCopyRefAve_U8_U8_MxN( 16, 8, curr_block, ref_block, stride); } void mlib_VideoCopyRefAve_U8_U8_8x8( uint8_t *curr_block, const uint8_t *ref_block, int32_t stride) { mlib_VideoCopyRefAve_U8_U8_MxN( 8, 8, curr_block, ref_block, stride); } void mlib_VideoCopyRefAve_U8_U8_8x4( uint8_t *curr_block, const uint8_t *ref_block, int32_t stride) { mlib_VideoCopyRefAve_U8_U8_MxN( 8, 4, curr_block, ref_block, stride); } #if 0 inline void mlib_VideoCopyRef_U8_U8( uint8_t *curr_block, const uint8_t *ref_block, int32_t width, int32_t height, int32_t stride) { int x,y; const int jump = stride - width; printf("I don't get called!"); for (y = 0; y < height; y++) { for (x = 0; x < width; x++) *curr_block++ = *ref_block++; ref_block += jump; curr_block += jump; } } #endif void print_U8_U8_MxN( const uint8_t m, const uint8_t n, uint8_t *block0, uint8_t *block1, int32_t stride) { int x,y; int jump = stride - m; printf("block %dx%d @ %x<-%x stride=%d\n",n,n,(uint32_t)block0,(uint32_t)block1,stride); for (y = 0; y < n; y++) { printf("%2d: ",y); for (x = 0; x < m; x++) { printf("%3d<%3d ",*block0++,*block1++); } printf("\n"); block0 += jump; block1 += jump; } } static inline void mlib_VideoCopyRef_U8_U8_MxN( const uint8_t m, const uint8_t n, uint8_t *curr_block, const uint8_t *ref_block, int32_t stride) { #define MMX_mmx_VideoCopyRef_U8_U8_MxN #if !defined(HAVE_MMX) || !defined(MMX_mmx_VideoCopyRef_U8_U8_MxN) int x,y; const int jump = stride - m; for (y = 0; y < n; y++) { for (x = 0; x < m; x++) *curr_block++ = *ref_block++; ref_block += jump; curr_block += jump; } #else int x,y; const int step = 8; const int jump = stride - m; pxor_r2r(mm0,mm0); // load 0 into mm0 for (y = 0; y < n; y++) { for (x = 0; x < m/8; x++) { movq_m2r(*ref_block,mm1); // load 8 ref bytes movq_r2m(mm1,*curr_block); // store 8 bytes at curr curr_block += step; ref_block += step; } curr_block += jump; ref_block += jump; } #endif } void mlib_VideoCopyRef_U8_U8_16x16( uint8_t *curr_block, const uint8_t *ref_block, int32_t stride) { mlib_VideoCopyRef_U8_U8_MxN( 16, 16, curr_block, ref_block, stride); } void mlib_VideoCopyRef_U8_U8_16x8( uint8_t *curr_block, const uint8_t *ref_block, int32_t stride) { mlib_VideoCopyRef_U8_U8_MxN( 16, 8, curr_block, ref_block, stride); } void mlib_VideoCopyRef_U8_U8_8x8( uint8_t *curr_block, const uint8_t *ref_block, int32_t stride) { mlib_VideoCopyRef_U8_U8_MxN( 8, 8, curr_block, ref_block, stride); } void mlib_VideoCopyRef_U8_U8_8x4( uint8_t *curr_block, const uint8_t *ref_block, int32_t stride) { mlib_VideoCopyRef_U8_U8_MxN( 8, 4, curr_block, ref_block, stride); } // VideoInterp*X - Half pixel interpolation in the x direction // ------------------------------------------------------------------ static inline void mlib_VideoInterpAveX_U8_U8_MxN( const uint8_t m, const uint8_t n, uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { #define MMX_mmx_VideoInterpAveX_U8_U8_MxN #if !defined(HAVE_MMX) || !defined(MMX_mmx_VideoInterpAveX_U8_U8_MxN) int x,y; const int jump = frame_stride - m; for (y = 0; y < n; y++) { for (x = 0; x < m; x++) *curr_block++ = (*curr_block + (*ref_block++ + *(ref_block + 1) + 1)/2 + 1)/2; ref_block += jump; curr_block += jump; } #else int x,y; const int step = 8; const int jump = frame_stride - m; pxor_r2r(mm0,mm0); // load 0 into mm0 for (y = 0; y < n; y++) { for (x = 0; x < m/8; x++) { mmx_interp_average_2_U8(curr_block, ref_block, ref_block + 1); curr_block += step; ref_block += step; } curr_block += jump; ref_block += jump; } #endif } void mlib_VideoInterpAveX_U8_U8_16x16( uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { mlib_VideoInterpAveX_U8_U8_MxN( 16, 16, curr_block, ref_block, frame_stride, field_stride); } void mlib_VideoInterpAveX_U8_U8_16x8( uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { mlib_VideoInterpAveX_U8_U8_MxN( 16, 8, curr_block, ref_block, frame_stride, field_stride); } void mlib_VideoInterpAveX_U8_U8_8x8( uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { mlib_VideoInterpAveX_U8_U8_MxN( 8, 8, curr_block, ref_block, frame_stride, field_stride); } void mlib_VideoInterpAveX_U8_U8_8x4( uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { mlib_VideoInterpAveX_U8_U8_MxN( 8, 4, curr_block, ref_block, frame_stride, field_stride); } static inline void mlib_VideoInterpX_U8_U8_MxN( const uint8_t m, const uint8_t n, uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { #define MMX_mmx_VideoInterpX_U8_U8_MxN #if !defined(HAVE_MMX) || !defined(MMX_mmx_VideoInterpX_U8_U8_MxN) int x,y; const int jump = frame_stride - m; for (y = 0; y < n; y++) { for (x = 0; x < m; x++) *curr_block++ = (*ref_block++ + *(ref_block + 1) + 1)/2; ref_block += jump; curr_block += jump; } #else int x,y; const int step = 8; const int jump = frame_stride - m; pxor_r2r(mm0,mm0); // load 0 into mm0 for (y = 0; y < n; y++) { for (x = 0; x < m/8; x++) { mmx_average_2_U8(curr_block, ref_block, ref_block + 1); curr_block += step; ref_block += step; } curr_block += jump; ref_block += jump; } #endif } void mlib_VideoInterpX_U8_U8_16x16( uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { mlib_VideoInterpX_U8_U8_MxN( 16, 16, curr_block, ref_block, frame_stride, field_stride); } void mlib_VideoInterpX_U8_U8_16x8( uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { mlib_VideoInterpX_U8_U8_MxN( 16, 8, curr_block, ref_block, frame_stride, field_stride); } void mlib_VideoInterpX_U8_U8_8x8( uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { mlib_VideoInterpX_U8_U8_MxN( 8, 8, curr_block, ref_block, frame_stride, field_stride); } void mlib_VideoInterpX_U8_U8_8x4( uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { mlib_VideoInterpX_U8_U8_MxN( 8, 4, curr_block, ref_block, frame_stride, field_stride); } // VideoInterp*XY - half pixel interpolation in both x and y directions // -------------------------------------------------------------------- static inline void mlib_VideoInterpAveXY_U8_U8_MxN( const uint8_t m, const uint8_t n, uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { #define MMX_mmx_VideoInterpAveXY_U8_U8_MxN #if !defined(HAVE_MMX) || !defined(MMX_mmx_VideoInterpAveXY_U8_U8_MxN) int x,y; const int jump = frame_stride - m; const uint8_t *ref_block_next = ref_block + field_stride; for (y = 0; y < n; y++) { for (x = 0; x < m; x++) *curr_block++ = (*curr_block + (*ref_block++ + *(ref_block + 1) + *ref_block_next++ + *(ref_block_next + 1) + 2)/4 + 1)/2; curr_block += jump; ref_block += jump; ref_block_next += jump; } #else int x,y; const int step = 8; const int jump = frame_stride - m; const uint8_t *ref_block_next = ref_block + field_stride; pxor_r2r(mm0,mm0); // load 0 into mm0 for (y = 0; y < n; y++) { for (x = 0; x < m/8; x++) { mmx_interp_average_4_U8(curr_block, ref_block, ref_block + 1, ref_block_next, ref_block_next + 1); curr_block += step; ref_block += step; ref_block_next += step; } curr_block += jump; ref_block += jump; ref_block_next += jump; } #endif } void mlib_VideoInterpAveXY_U8_U8_16x16( uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { mlib_VideoInterpAveXY_U8_U8_MxN( 16, 16, curr_block, ref_block, frame_stride, field_stride); } void mlib_VideoInterpAveXY_U8_U8_16x8( uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { mlib_VideoInterpAveXY_U8_U8_MxN( 16, 8, curr_block, ref_block, frame_stride, field_stride); } void mlib_VideoInterpAveXY_U8_U8_8x8( uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { mlib_VideoInterpAveXY_U8_U8_MxN( 8, 8, curr_block, ref_block, frame_stride, field_stride); } void mlib_VideoInterpAveXY_U8_U8_8x4( uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { mlib_VideoInterpAveXY_U8_U8_MxN( 8, 4, curr_block, ref_block, frame_stride, field_stride); } static inline void mlib_VideoInterpXY_U8_U8_MxN( const uint8_t m, const uint8_t n, uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { #define MMX_mmx_VideoInterpXY_U8_U8_MxN #if !defined(HAVE_MMX) || !defined(MMX_mmx_VideoInterpXY_U8_U8_MxN) int x,y; const int jump = frame_stride - m; const uint8_t *ref_block_next = ref_block + field_stride; for (y = 0; y < n; y++) { for (x = 0; x < m; x++) *curr_block++ = (*ref_block++ + *(ref_block + 1) + *ref_block_next++ + *(ref_block_next + 1) + 2)/4; curr_block += jump; ref_block += jump; ref_block_next += jump; } #else int x,y; const int step = 8; const int jump = frame_stride - m; const uint8_t *ref_block_next = ref_block + field_stride; pxor_r2r(mm0,mm0); // load 0 into mm0 for (y = 0; y < n; y++) { for (x = 0; x < m/8; x++) { mmx_average_4_U8(curr_block, ref_block, ref_block + 1, ref_block_next, ref_block_next + 1); curr_block += step; ref_block += step; ref_block_next += step; } curr_block += jump; ref_block += jump; ref_block_next += jump; } #endif } void mlib_VideoInterpXY_U8_U8_16x16( uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { mlib_VideoInterpXY_U8_U8_MxN( 16, 16, curr_block, ref_block, frame_stride, field_stride); } void mlib_VideoInterpXY_U8_U8_16x8( uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { mlib_VideoInterpXY_U8_U8_MxN( 16, 8, curr_block, ref_block, frame_stride, field_stride); } void mlib_VideoInterpXY_U8_U8_8x8( uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { mlib_VideoInterpXY_U8_U8_MxN( 8, 8, curr_block, ref_block, frame_stride, field_stride); } void mlib_VideoInterpXY_U8_U8_8x4( uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { mlib_VideoInterpXY_U8_U8_MxN( 8, 4, curr_block, ref_block, frame_stride, field_stride); } // VideoInterp*Y - half pixel interpolation in the y direction // ----------------------------------------------------------- static inline void mlib_VideoInterpAveY_U8_U8_MxN( const uint8_t m, const uint8_t n, uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { #define MMX_mmx_VideoInterpAveY_U8_U8_MxN #if !defined(HAVE_MMX) || !defined(MMX_mmx_VideoInterpAveY_U8_U8_MxN) int x,y; const int jump = frame_stride - m; const uint8_t *ref_block_next = ref_block + field_stride; for (y = 0; y < n; y++) { for (x = 0; x < m; x++) *curr_block++ = (*curr_block + (*ref_block++ + *ref_block_next++ + 1)/2 + 1)/2; curr_block += jump; ref_block += jump; ref_block_next += jump; } #else int x,y; const int step = 8; const int jump = frame_stride - m; const uint8_t *ref_block_next = ref_block + field_stride; pxor_r2r(mm0,mm0); // load 0 into mm0 for (y = 0; y < n; y++) { for (x = 0; x < m/8; x++) { mmx_interp_average_2_U8(curr_block, ref_block, ref_block_next); curr_block += step; ref_block += step; ref_block_next += step; } curr_block += jump; ref_block += jump; ref_block_next += jump; } #endif } void mlib_VideoInterpAveY_U8_U8_16x16( uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { mlib_VideoInterpAveY_U8_U8_MxN( 16, 16, curr_block, ref_block, frame_stride, field_stride); } void mlib_VideoInterpAveY_U8_U8_16x8( uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { mlib_VideoInterpAveY_U8_U8_MxN( 16, 8, curr_block, ref_block, frame_stride, field_stride); } void mlib_VideoInterpAveY_U8_U8_8x8( uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { mlib_VideoInterpAveY_U8_U8_MxN( 8, 8, curr_block, ref_block, frame_stride, field_stride); } void mlib_VideoInterpAveY_U8_U8_8x4( uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { mlib_VideoInterpAveY_U8_U8_MxN( 8, 4, curr_block, ref_block, frame_stride, field_stride); } static inline void mlib_VideoInterpY_U8_U8_MxN( const uint8_t m, const uint8_t n, uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { #define MMX_mmx_VideoInterpY_U8_U8_MxN #if !defined(HAVE_MMX) || !defined(MMX_mmx_VideoInterpY_U8_U8_MxN) int x,y; const int jump = frame_stride - m; const uint8_t *ref_block_next = ref_block + field_stride; for (y = 0; y < n; y++) { for (x = 0; x < m; x++) *curr_block++ = (*ref_block++ + *ref_block_next++ + 1)/2; curr_block += jump; ref_block += jump; ref_block_next += jump; } #else int x,y; const int step = 8; const int jump = frame_stride - m; const uint8_t *ref_block_next = ref_block + field_stride; pxor_r2r(mm0,mm0); // load 0 into mm0 for (y = 0; y < n; y++) { for (x = 0; x < m/8; x++) { mmx_average_2_U8(curr_block, ref_block, ref_block_next); curr_block += step; ref_block += step; ref_block_next += step; } curr_block += jump; ref_block += jump; ref_block_next += jump; } #endif } void mlib_VideoInterpY_U8_U8_16x16( uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { mlib_VideoInterpY_U8_U8_MxN( 16, 16, curr_block, ref_block, frame_stride, field_stride); } void mlib_VideoInterpY_U8_U8_16x8( uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { mlib_VideoInterpY_U8_U8_MxN( 16, 8, curr_block, ref_block, frame_stride, field_stride); } void mlib_VideoInterpY_U8_U8_8x8( uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { mlib_VideoInterpY_U8_U8_MxN( 8, 8, curr_block, ref_block, frame_stride, field_stride); } void mlib_VideoInterpY_U8_U8_8x4( uint8_t *curr_block, const uint8_t *ref_block, int32_t frame_stride, int32_t field_stride) { mlib_VideoInterpY_U8_U8_MxN( 8, 4, curr_block, ref_block, frame_stride, field_stride); }