1
0
mirror of https://github.com/godotengine/godot.git synced 2025-11-24 15:26:15 +00:00

libwebp: Sync with upstream 1.2.1

Changes: https://chromium.googlesource.com/webm/libwebp/+/1.2.1/NEWS
This commit is contained in:
Rémi Verschelde
2021-11-19 13:54:40 +01:00
parent 42f8bfaff0
commit 41ce417847
71 changed files with 1649 additions and 1138 deletions

View File

@@ -157,7 +157,8 @@ void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse) {
}
}
void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
void WebPMultRow_C(uint8_t* WEBP_RESTRICT const ptr,
const uint8_t* WEBP_RESTRICT const alpha,
int width, int inverse) {
int x;
for (x = 0; x < width; ++x) {
@@ -178,7 +179,8 @@ void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
#undef MFIX
void (*WebPMultARGBRow)(uint32_t* const ptr, int width, int inverse);
void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
void (*WebPMultRow)(uint8_t* WEBP_RESTRICT const ptr,
const uint8_t* WEBP_RESTRICT const alpha,
int width, int inverse);
//------------------------------------------------------------------------------
@@ -193,8 +195,8 @@ void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
}
}
void WebPMultRows(uint8_t* ptr, int stride,
const uint8_t* alpha, int alpha_stride,
void WebPMultRows(uint8_t* WEBP_RESTRICT ptr, int stride,
const uint8_t* WEBP_RESTRICT alpha, int alpha_stride,
int width, int num_rows, int inverse) {
int n;
for (n = 0; n < num_rows; ++n) {
@@ -290,9 +292,9 @@ static void ApplyAlphaMultiply_16b_C(uint8_t* rgba4444,
}
#if !WEBP_NEON_OMIT_C_CODE
static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
static int DispatchAlpha_C(const uint8_t* WEBP_RESTRICT alpha, int alpha_stride,
int width, int height,
uint8_t* dst, int dst_stride) {
uint8_t* WEBP_RESTRICT dst, int dst_stride) {
uint32_t alpha_mask = 0xff;
int i, j;
@@ -309,9 +311,10 @@ static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
return (alpha_mask != 0xff);
}
static void DispatchAlphaToGreen_C(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint32_t* dst, int dst_stride) {
static void DispatchAlphaToGreen_C(const uint8_t* WEBP_RESTRICT alpha,
int alpha_stride, int width, int height,
uint32_t* WEBP_RESTRICT dst,
int dst_stride) {
int i, j;
for (j = 0; j < height; ++j) {
for (i = 0; i < width; ++i) {
@@ -322,9 +325,9 @@ static void DispatchAlphaToGreen_C(const uint8_t* alpha, int alpha_stride,
}
}
static int ExtractAlpha_C(const uint8_t* argb, int argb_stride,
static int ExtractAlpha_C(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
int width, int height,
uint8_t* alpha, int alpha_stride) {
uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
uint8_t alpha_mask = 0xff;
int i, j;
@@ -340,7 +343,8 @@ static int ExtractAlpha_C(const uint8_t* argb, int argb_stride,
return (alpha_mask == 0xff);
}
static void ExtractGreen_C(const uint32_t* argb, uint8_t* alpha, int size) {
static void ExtractGreen_C(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT alpha, int size) {
int i;
for (i = 0; i < size; ++i) alpha[i] = argb[i] >> 8;
}
@@ -359,6 +363,11 @@ static int HasAlpha32b_C(const uint8_t* src, int length) {
return 0;
}
static void AlphaReplace_C(uint32_t* src, int length, uint32_t color) {
int x;
for (x = 0; x < length; ++x) if ((src[x] >> 24) == 0) src[x] = color;
}
//------------------------------------------------------------------------------
// Simple channel manipulations.
@@ -367,8 +376,11 @@ static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
}
#ifdef WORDS_BIGENDIAN
static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g,
const uint8_t* b, int len, uint32_t* out) {
static void PackARGB_C(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT r,
const uint8_t* WEBP_RESTRICT g,
const uint8_t* WEBP_RESTRICT b,
int len, uint32_t* WEBP_RESTRICT out) {
int i;
for (i = 0; i < len; ++i) {
out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
@@ -376,8 +388,10 @@ static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g,
}
#endif
static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out) {
static void PackRGB_C(const uint8_t* WEBP_RESTRICT r,
const uint8_t* WEBP_RESTRICT g,
const uint8_t* WEBP_RESTRICT b,
int len, int step, uint32_t* WEBP_RESTRICT out) {
int i, offset = 0;
for (i = 0; i < len; ++i) {
out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
@@ -387,19 +401,26 @@ static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
int (*WebPDispatchAlpha)(const uint8_t* WEBP_RESTRICT, int, int, int,
uint8_t* WEBP_RESTRICT, int);
void (*WebPDispatchAlphaToGreen)(const uint8_t* WEBP_RESTRICT, int, int, int,
uint32_t* WEBP_RESTRICT, int);
int (*WebPExtractAlpha)(const uint8_t* WEBP_RESTRICT, int, int, int,
uint8_t* WEBP_RESTRICT, int);
void (*WebPExtractGreen)(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT alpha, int size);
#ifdef WORDS_BIGENDIAN
void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r, const uint8_t* g,
const uint8_t* b, int, uint32_t*);
#endif
void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out);
void (*WebPPackRGB)(const uint8_t* WEBP_RESTRICT r,
const uint8_t* WEBP_RESTRICT g,
const uint8_t* WEBP_RESTRICT b,
int len, int step, uint32_t* WEBP_RESTRICT out);
int (*WebPHasAlpha8b)(const uint8_t* src, int length);
int (*WebPHasAlpha32b)(const uint8_t* src, int length);
void (*WebPAlphaReplace)(uint32_t* src, int length, uint32_t color);
//------------------------------------------------------------------------------
// Init function
@@ -428,13 +449,14 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
WebPHasAlpha8b = HasAlpha8b_C;
WebPHasAlpha32b = HasAlpha32b_C;
WebPAlphaReplace = AlphaReplace_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitAlphaProcessingSSE2();
#if defined(WEBP_USE_SSE41)
#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitAlphaProcessingSSE41();
}
@@ -448,7 +470,7 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
#endif
}
#if defined(WEBP_USE_NEON)
#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPInitAlphaProcessingNEON();
@@ -469,4 +491,5 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
assert(WebPPackRGB != NULL);
assert(WebPHasAlpha8b != NULL);
assert(WebPHasAlpha32b != NULL);
assert(WebPAlphaReplace != NULL);
}

View File

@@ -80,9 +80,9 @@ static void ApplyAlphaMultiply_NEON(uint8_t* rgba, int alpha_first,
//------------------------------------------------------------------------------
static int DispatchAlpha_NEON(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint8_t* dst, int dst_stride) {
static int DispatchAlpha_NEON(const uint8_t* WEBP_RESTRICT alpha,
int alpha_stride, int width, int height,
uint8_t* WEBP_RESTRICT dst, int dst_stride) {
uint32_t alpha_mask = 0xffffffffu;
uint8x8_t mask8 = vdup_n_u8(0xff);
uint32_t tmp[2];
@@ -112,9 +112,10 @@ static int DispatchAlpha_NEON(const uint8_t* alpha, int alpha_stride,
return (alpha_mask != 0xffffffffu);
}
static void DispatchAlphaToGreen_NEON(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint32_t* dst, int dst_stride) {
static void DispatchAlphaToGreen_NEON(const uint8_t* WEBP_RESTRICT alpha,
int alpha_stride, int width, int height,
uint32_t* WEBP_RESTRICT dst,
int dst_stride) {
int i, j;
uint8x8x4_t greens; // leave A/R/B channels zero'd.
greens.val[0] = vdup_n_u8(0);
@@ -131,9 +132,9 @@ static void DispatchAlphaToGreen_NEON(const uint8_t* alpha, int alpha_stride,
}
}
static int ExtractAlpha_NEON(const uint8_t* argb, int argb_stride,
static int ExtractAlpha_NEON(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
int width, int height,
uint8_t* alpha, int alpha_stride) {
uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
uint32_t alpha_mask = 0xffffffffu;
uint8x8_t mask8 = vdup_n_u8(0xff);
uint32_t tmp[2];
@@ -161,8 +162,8 @@ static int ExtractAlpha_NEON(const uint8_t* argb, int argb_stride,
return (alpha_mask == 0xffffffffu);
}
static void ExtractGreen_NEON(const uint32_t* argb,
uint8_t* alpha, int size) {
static void ExtractGreen_NEON(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT alpha, int size) {
int i;
for (i = 0; i + 16 <= size; i += 16) {
const uint8x16x4_t rgbX = vld4q_u8((const uint8_t*)(argb + i));

View File

@@ -18,9 +18,9 @@
//------------------------------------------------------------------------------
static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint8_t* dst, int dst_stride) {
static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha,
int alpha_stride, int width, int height,
uint8_t* WEBP_RESTRICT dst, int dst_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
@@ -72,9 +72,10 @@ static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
return (alpha_and != 0xff);
}
static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint32_t* dst, int dst_stride) {
static void DispatchAlphaToGreen_SSE2(const uint8_t* WEBP_RESTRICT alpha,
int alpha_stride, int width, int height,
uint32_t* WEBP_RESTRICT dst,
int dst_stride) {
int i, j;
const __m128i zero = _mm_setzero_si128();
const int limit = width & ~15;
@@ -98,9 +99,9 @@ static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
}
}
static int ExtractAlpha_SSE2(const uint8_t* argb, int argb_stride,
static int ExtractAlpha_SSE2(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
int width, int height,
uint8_t* alpha, int alpha_stride) {
uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
@@ -265,6 +266,27 @@ static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
return 0;
}
static void AlphaReplace_SSE2(uint32_t* src, int length, uint32_t color) {
const __m128i m_color = _mm_set1_epi32(color);
const __m128i zero = _mm_setzero_si128();
int i = 0;
for (; i + 8 <= length; i += 8) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0));
const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 4));
const __m128i b0 = _mm_srai_epi32(a0, 24);
const __m128i b1 = _mm_srai_epi32(a1, 24);
const __m128i c0 = _mm_cmpeq_epi32(b0, zero);
const __m128i c1 = _mm_cmpeq_epi32(b1, zero);
const __m128i d0 = _mm_and_si128(c0, m_color);
const __m128i d1 = _mm_and_si128(c1, m_color);
const __m128i e0 = _mm_andnot_si128(c0, a0);
const __m128i e1 = _mm_andnot_si128(c1, a1);
_mm_storeu_si128((__m128i*)(src + i + 0), _mm_or_si128(d0, e0));
_mm_storeu_si128((__m128i*)(src + i + 4), _mm_or_si128(d1, e1));
}
for (; i < length; ++i) if ((src[i] >> 24) == 0) src[i] = color;
}
// -----------------------------------------------------------------------------
// Apply alpha value to rows
@@ -296,7 +318,8 @@ static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse);
}
static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
static void MultRow_SSE2(uint8_t* WEBP_RESTRICT const ptr,
const uint8_t* WEBP_RESTRICT const alpha,
int width, int inverse) {
int x = 0;
if (!inverse) {
@@ -334,6 +357,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
WebPHasAlpha8b = HasAlpha8b_SSE2;
WebPHasAlpha32b = HasAlpha32b_SSE2;
WebPAlphaReplace = AlphaReplace_SSE2;
}
#else // !WEBP_USE_SSE2

View File

@@ -19,9 +19,9 @@
//------------------------------------------------------------------------------
static int ExtractAlpha_SSE41(const uint8_t* argb, int argb_stride,
int width, int height,
uint8_t* alpha, int alpha_stride) {
static int ExtractAlpha_SSE41(const uint8_t* WEBP_RESTRICT argb,
int argb_stride, int width, int height,
uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;

View File

@@ -395,12 +395,12 @@ WEBP_DSP_INIT_FUNC(VP8EncDspCostInit) {
VP8EncDspCostInitMIPSdspR2();
}
#endif
#if defined(WEBP_USE_SSE2)
#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8EncDspCostInitSSE2();
}
#endif
#if defined(WEBP_USE_NEON)
#if defined(WEBP_HAVE_NEON)
if (VP8GetCPUInfo(kNEON)) {
VP8EncDspCostInitNEON();
}

View File

@@ -55,12 +55,18 @@ static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type), "c"(0));
}
#elif (defined(_M_X64) || defined(_M_IX86)) && \
defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729 // >= VS2008 SP1
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
#if defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729 // >= VS2008 SP1
#include <intrin.h>
#define GetCPUInfo(info, type) __cpuidex(info, type, 0) // set ecx=0
#elif defined(WEBP_MSC_SSE2)
#define WEBP_HAVE_MSC_CPUID
#elif _MSC_VER > 1310
#include <intrin.h>
#define GetCPUInfo __cpuid
#define WEBP_HAVE_MSC_CPUID
#endif
#endif
// NaCl has no support for xgetbv or the raw opcode.
@@ -94,7 +100,7 @@ static WEBP_INLINE uint64_t xgetbv(void) {
#define xgetbv() 0U // no AVX for older x64 or unrecognized toolchains.
#endif
#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)
#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_HAVE_MSC_CPUID)
// helper function for run-time detection of slow SSSE3 platforms
static int CheckSlowModel(int info) {
@@ -179,9 +185,34 @@ static int AndroidCPUInfo(CPUFeature feature) {
return 0;
}
VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo;
#elif defined(WEBP_USE_NEON)
// define a dummy function to enable turning off NEON at runtime by setting
// VP8DecGetCPUInfo = NULL
#elif defined(EMSCRIPTEN) // also needs to be before generic NEON test
// Use compile flags as an indicator of SIMD support instead of a runtime check.
static int wasmCPUInfo(CPUFeature feature) {
switch (feature) {
#ifdef WEBP_HAVE_SSE2
case kSSE2:
return 1;
#endif
#ifdef WEBP_HAVE_SSE41
case kSSE3:
case kSlowSSSE3:
case kSSE4_1:
return 1;
#endif
#ifdef WEBP_HAVE_NEON
case kNEON:
return 1;
#endif
default:
break;
}
return 0;
}
VP8CPUInfo VP8GetCPUInfo = wasmCPUInfo;
#elif defined(WEBP_HAVE_NEON)
// In most cases this function doesn't check for NEON support (it's assumed by
// the configuration), but enables turning off NEON at runtime, for testing
// purposes, by setting VP8DecGetCPUInfo = NULL.
static int armCPUInfo(CPUFeature feature) {
if (feature != kNEON) return 0;
#if defined(__linux__) && defined(WEBP_HAVE_NEON_RTCD)

View File

@@ -807,10 +807,10 @@ WEBP_DSP_INIT_FUNC(VP8DspInit) {
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8DspInitSSE2();
#if defined(WEBP_USE_SSE41)
#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
VP8DspInitSSE41();
}
@@ -834,7 +834,7 @@ WEBP_DSP_INIT_FUNC(VP8DspInit) {
#endif
}
#if defined(WEBP_USE_NEON)
#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8DspInitNEON();

View File

@@ -1283,12 +1283,12 @@ static void DC4_NEON(uint8_t* dst) { // DC
const uint8x8_t A = vld1_u8(dst - BPS); // top row
const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
const uint16x4_t p1 = vpadd_u16(p0, p0);
const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
const uint16x8_t s0 = vaddq_u16(L0, L1);
const uint16x8_t s1 = vaddq_u16(L2, L3);
const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
const uint16x8_t s0 = vaddl_u8(L0, L1);
const uint16x8_t s1 = vaddl_u8(L2, L3);
const uint16x8_t s01 = vaddq_u16(s0, s1);
const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
const uint8x8_t dc0 = vrshrn_n_u16(sum, 3); // (sum + 4) >> 3
@@ -1429,8 +1429,7 @@ static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
if (do_top) {
const uint8x8_t A = vld1_u8(dst - BPS); // top row
#if defined(__aarch64__)
const uint16x8_t B = vmovl_u8(A);
const uint16_t p2 = vaddvq_u16(B);
const uint16_t p2 = vaddlv_u8(A);
sum_top = vdupq_n_u16(p2);
#else
const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
@@ -1441,18 +1440,18 @@ static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
}
if (do_left) {
const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
const uint16x8_t s0 = vaddq_u16(L0, L1);
const uint16x8_t s1 = vaddq_u16(L2, L3);
const uint16x8_t s2 = vaddq_u16(L4, L5);
const uint16x8_t s3 = vaddq_u16(L6, L7);
const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
const uint8x8_t L4 = vld1_u8(dst + 4 * BPS - 1);
const uint8x8_t L5 = vld1_u8(dst + 5 * BPS - 1);
const uint8x8_t L6 = vld1_u8(dst + 6 * BPS - 1);
const uint8x8_t L7 = vld1_u8(dst + 7 * BPS - 1);
const uint16x8_t s0 = vaddl_u8(L0, L1);
const uint16x8_t s1 = vaddl_u8(L2, L3);
const uint16x8_t s2 = vaddl_u8(L4, L5);
const uint16x8_t s3 = vaddl_u8(L6, L7);
const uint16x8_t s01 = vaddq_u16(s0, s1);
const uint16x8_t s23 = vaddq_u16(s2, s3);
sum_left = vaddq_u16(s01, s23);
@@ -1512,29 +1511,34 @@ static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
if (do_top) {
const uint8x16_t A = vld1q_u8(dst - BPS); // top row
#if defined(__aarch64__)
const uint16_t p3 = vaddlvq_u8(A);
sum_top = vdupq_n_u16(p3);
#else
const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
const uint16x4_t p2 = vpadd_u16(p1, p1);
const uint16x4_t p3 = vpadd_u16(p2, p2);
sum_top = vcombine_u16(p3, p3);
#endif
}
if (do_left) {
int i;
sum_left = vdupq_n_u16(0);
for (i = 0; i < 16; i += 8) {
const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
const uint16x8_t s0 = vaddq_u16(L0, L1);
const uint16x8_t s1 = vaddq_u16(L2, L3);
const uint16x8_t s2 = vaddq_u16(L4, L5);
const uint16x8_t s3 = vaddq_u16(L6, L7);
const uint8x8_t L0 = vld1_u8(dst + (i + 0) * BPS - 1);
const uint8x8_t L1 = vld1_u8(dst + (i + 1) * BPS - 1);
const uint8x8_t L2 = vld1_u8(dst + (i + 2) * BPS - 1);
const uint8x8_t L3 = vld1_u8(dst + (i + 3) * BPS - 1);
const uint8x8_t L4 = vld1_u8(dst + (i + 4) * BPS - 1);
const uint8x8_t L5 = vld1_u8(dst + (i + 5) * BPS - 1);
const uint8x8_t L6 = vld1_u8(dst + (i + 6) * BPS - 1);
const uint8x8_t L7 = vld1_u8(dst + (i + 7) * BPS - 1);
const uint16x8_t s0 = vaddl_u8(L0, L1);
const uint16x8_t s1 = vaddl_u8(L2, L3);
const uint16x8_t s2 = vaddl_u8(L4, L5);
const uint16x8_t s3 = vaddl_u8(L6, L7);
const uint16x8_t s01 = vaddq_u16(s0, s1);
const uint16x8_t s23 = vaddq_u16(s2, s3);
const uint16x8_t sum = vaddq_u16(s01, s23);

View File

@@ -26,6 +26,23 @@ extern "C" {
#define BPS 32 // this is the common stride for enc/dec
//------------------------------------------------------------------------------
// WEBP_RESTRICT
// Declares a pointer with the restrict type qualifier if available.
// This allows code to hint to the compiler that only this pointer references a
// particular object or memory region within the scope of the block in which it
// is declared. This may allow for improved optimizations due to the lack of
// pointer aliasing. See also:
// https://en.cppreference.com/w/c/language/restrict
#if defined(__GNUC__)
#define WEBP_RESTRICT __restrict__
#elif defined(_MSC_VER)
#define WEBP_RESTRICT __restrict
#else
#define WEBP_RESTRICT
#endif
//------------------------------------------------------------------------------
// CPU detection
@@ -51,9 +68,7 @@ extern "C" {
# define __has_builtin(x) 0
#endif
// for now, none of the optimizations below are available in emscripten
#if !defined(EMSCRIPTEN)
#if !defined(HAVE_CONFIG_H)
#if defined(_MSC_VER) && _MSC_VER > 1310 && \
(defined(_M_X64) || defined(_M_IX86))
#define WEBP_MSC_SSE2 // Visual C++ SSE2 targets
@@ -63,23 +78,37 @@ extern "C" {
(defined(_M_X64) || defined(_M_IX86))
#define WEBP_MSC_SSE41 // Visual C++ SSE4.1 targets
#endif
#endif
// WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
// files without intrinsics, allowing the corresponding Init() to be called.
// Files containing intrinsics will need to be built targeting the instruction
// set so should succeed on one of the earlier tests.
#if defined(__SSE2__) || defined(WEBP_MSC_SSE2) || defined(WEBP_HAVE_SSE2)
#if (defined(__SSE2__) || defined(WEBP_MSC_SSE2)) && \
(!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE2))
#define WEBP_USE_SSE2
#endif
#if defined(__SSE4_1__) || defined(WEBP_MSC_SSE41) || defined(WEBP_HAVE_SSE41)
#if defined(WEBP_USE_SSE2) && !defined(WEBP_HAVE_SSE2)
#define WEBP_HAVE_SSE2
#endif
#if (defined(__SSE4_1__) || defined(WEBP_MSC_SSE41)) && \
(!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE41))
#define WEBP_USE_SSE41
#endif
#if defined(WEBP_USE_SSE41) && !defined(WEBP_HAVE_SSE41)
#define WEBP_HAVE_SSE41
#endif
#undef WEBP_MSC_SSE41
#undef WEBP_MSC_SSE2
// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
// inline assembly would need to be modified for use with Native Client.
#if (defined(__ARM_NEON__) || \
defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \
#if ((defined(__ARM_NEON__) || defined(__aarch64__)) && \
(!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_NEON))) && \
!defined(__native_client__)
#define WEBP_USE_NEON
#endif
@@ -95,6 +124,10 @@ extern "C" {
#define WEBP_USE_INTRINSICS
#endif
#if defined(WEBP_USE_NEON) && !defined(WEBP_HAVE_NEON)
#define WEBP_HAVE_NEON
#endif
#if defined(__mips__) && !defined(__mips64) && \
defined(__mips_isa_rev) && (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
#define WEBP_USE_MIPS32
@@ -110,13 +143,11 @@ extern "C" {
#define WEBP_USE_MSA
#endif
#endif /* EMSCRIPTEN */
#ifndef WEBP_DSP_OMIT_C_CODE
#define WEBP_DSP_OMIT_C_CODE 1
#endif
#if (defined(__aarch64__) || defined(__ARM_NEON__)) && WEBP_DSP_OMIT_C_CODE
#if defined(WEBP_USE_NEON) && WEBP_DSP_OMIT_C_CODE
#define WEBP_NEON_OMIT_C_CODE 1
#else
#define WEBP_NEON_OMIT_C_CODE 0
@@ -193,6 +224,12 @@ extern "C" {
#endif
#endif
// If 'ptr' is NULL, returns NULL. Otherwise returns 'ptr + off'.
// Prevents undefined behavior sanitizer nullptr-with-nonzero-offset warning.
#if !defined(WEBP_OFFSET_PTR)
#define WEBP_OFFSET_PTR(ptr, off) (((ptr) == NULL) ? NULL : ((ptr) + (off)))
#endif
// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility)
#if !defined(WEBP_SWAP_16BIT_CSP)
#define WEBP_SWAP_16BIT_CSP 0
@@ -572,26 +609,29 @@ extern void (*WebPApplyAlphaMultiply4444)(
// Dispatch the values from alpha[] plane to the ARGB destination 'dst'.
// Returns true if alpha[] plane has non-trivial values different from 0xff.
extern int (*WebPDispatchAlpha)(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint8_t* dst, int dst_stride);
extern int (*WebPDispatchAlpha)(const uint8_t* WEBP_RESTRICT alpha,
int alpha_stride, int width, int height,
uint8_t* WEBP_RESTRICT dst, int dst_stride);
// Transfer packed 8b alpha[] values to green channel in dst[], zero'ing the
// A/R/B values. 'dst_stride' is the stride for dst[] in uint32_t units.
extern void (*WebPDispatchAlphaToGreen)(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint32_t* dst, int dst_stride);
extern void (*WebPDispatchAlphaToGreen)(const uint8_t* WEBP_RESTRICT alpha,
int alpha_stride, int width, int height,
uint32_t* WEBP_RESTRICT dst,
int dst_stride);
// Extract the alpha values from 32b values in argb[] and pack them into alpha[]
// (this is the opposite of WebPDispatchAlpha).
// Returns true if there's only trivial 0xff alpha values.
extern int (*WebPExtractAlpha)(const uint8_t* argb, int argb_stride,
int width, int height,
uint8_t* alpha, int alpha_stride);
extern int (*WebPExtractAlpha)(const uint8_t* WEBP_RESTRICT argb,
int argb_stride, int width, int height,
uint8_t* WEBP_RESTRICT alpha,
int alpha_stride);
// Extract the green values from 32b values in argb[] and pack them into alpha[]
// (this is the opposite of WebPDispatchAlphaToGreen).
extern void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
extern void (*WebPExtractGreen)(const uint32_t* WEBP_RESTRICT argb,
uint8_t* WEBP_RESTRICT alpha, int size);
// Pre-Multiply operation transforms x into x * A / 255 (where x=Y,R,G or B).
// Un-Multiply operation transforms x into x * 255 / A.
@@ -604,34 +644,42 @@ void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
int inverse);
// Same for a row of single values, with side alpha values.
extern void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
extern void (*WebPMultRow)(uint8_t* WEBP_RESTRICT const ptr,
const uint8_t* WEBP_RESTRICT const alpha,
int width, int inverse);
// Same a WebPMultRow(), but for several 'num_rows' rows.
void WebPMultRows(uint8_t* ptr, int stride,
const uint8_t* alpha, int alpha_stride,
void WebPMultRows(uint8_t* WEBP_RESTRICT ptr, int stride,
const uint8_t* WEBP_RESTRICT alpha, int alpha_stride,
int width, int num_rows, int inverse);
// Plain-C versions, used as fallback by some implementations.
void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
void WebPMultRow_C(uint8_t* WEBP_RESTRICT const ptr,
const uint8_t* WEBP_RESTRICT const alpha,
int width, int inverse);
void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse);
#ifdef WORDS_BIGENDIAN
// ARGB packing function: a/r/g/b input is rgba or bgra order.
extern void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r,
const uint8_t* g, const uint8_t* b, int len,
uint32_t* out);
extern void (*WebPPackARGB)(const uint8_t* WEBP_RESTRICT a,
const uint8_t* WEBP_RESTRICT r,
const uint8_t* WEBP_RESTRICT g,
const uint8_t* WEBP_RESTRICT b,
int len, uint32_t* WEBP_RESTRICT out);
#endif
// RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out);
extern void (*WebPPackRGB)(const uint8_t* WEBP_RESTRICT r,
const uint8_t* WEBP_RESTRICT g,
const uint8_t* WEBP_RESTRICT b,
int len, int step, uint32_t* WEBP_RESTRICT out);
// This function returns true if src[i] contains a value different from 0xff.
extern int (*WebPHasAlpha8b)(const uint8_t* src, int length);
// This function returns true if src[4*i] contains a value different from 0xff.
extern int (*WebPHasAlpha32b)(const uint8_t* src, int length);
// replaces transparent values in src[] by 'color'.
extern void (*WebPAlphaReplace)(uint32_t* src, int length, uint32_t color);
// To be called first before using the above.
void WebPInitAlphaProcessing(void);

View File

@@ -773,10 +773,10 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8EncDspInitSSE2();
#if defined(WEBP_USE_SSE41)
#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
VP8EncDspInitSSE41();
}
@@ -800,7 +800,7 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
#endif
}
#if defined(WEBP_USE_NEON)
#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8EncDspInitNEON();

View File

@@ -254,7 +254,7 @@ WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
#endif
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8FiltersInitSSE2();
}
@@ -271,7 +271,7 @@ WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
#endif
}
#if defined(WEBP_USE_NEON)
#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8FiltersInitNEON();

View File

@@ -320,7 +320,12 @@ extern void VP8FiltersInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitSSE2(void) {
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_SSE2;
#if defined(CHROMIUM)
// TODO(crbug.com/654974)
(void)VerticalUnfilter_SSE2;
#else
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_SSE2;
#endif
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_SSE2;
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_SSE2;

View File

@@ -107,62 +107,62 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
//------------------------------------------------------------------------------
// Predictors
static uint32_t Predictor0_C(uint32_t left, const uint32_t* const top) {
uint32_t VP8LPredictor0_C(uint32_t left, const uint32_t* const top) {
(void)top;
(void)left;
return ARGB_BLACK;
}
static uint32_t Predictor1_C(uint32_t left, const uint32_t* const top) {
uint32_t VP8LPredictor1_C(uint32_t left, const uint32_t* const top) {
(void)top;
return left;
}
static uint32_t Predictor2_C(uint32_t left, const uint32_t* const top) {
uint32_t VP8LPredictor2_C(uint32_t left, const uint32_t* const top) {
(void)left;
return top[0];
}
static uint32_t Predictor3_C(uint32_t left, const uint32_t* const top) {
uint32_t VP8LPredictor3_C(uint32_t left, const uint32_t* const top) {
(void)left;
return top[1];
}
static uint32_t Predictor4_C(uint32_t left, const uint32_t* const top) {
uint32_t VP8LPredictor4_C(uint32_t left, const uint32_t* const top) {
(void)left;
return top[-1];
}
static uint32_t Predictor5_C(uint32_t left, const uint32_t* const top) {
uint32_t VP8LPredictor5_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average3(left, top[0], top[1]);
return pred;
}
static uint32_t Predictor6_C(uint32_t left, const uint32_t* const top) {
uint32_t VP8LPredictor6_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(left, top[-1]);
return pred;
}
static uint32_t Predictor7_C(uint32_t left, const uint32_t* const top) {
uint32_t VP8LPredictor7_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(left, top[0]);
return pred;
}
static uint32_t Predictor8_C(uint32_t left, const uint32_t* const top) {
uint32_t VP8LPredictor8_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(top[-1], top[0]);
(void)left;
return pred;
}
static uint32_t Predictor9_C(uint32_t left, const uint32_t* const top) {
uint32_t VP8LPredictor9_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(top[0], top[1]);
(void)left;
return pred;
}
static uint32_t Predictor10_C(uint32_t left, const uint32_t* const top) {
uint32_t VP8LPredictor10_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
return pred;
}
static uint32_t Predictor11_C(uint32_t left, const uint32_t* const top) {
uint32_t VP8LPredictor11_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Select(top[0], left, top[-1]);
return pred;
}
static uint32_t Predictor12_C(uint32_t left, const uint32_t* const top) {
uint32_t VP8LPredictor12_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
return pred;
}
static uint32_t Predictor13_C(uint32_t left, const uint32_t* const top) {
uint32_t VP8LPredictor13_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
return pred;
}
@@ -182,18 +182,18 @@ static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper,
out[i] = left = VP8LAddPixels(in[i], left);
}
}
GENERATE_PREDICTOR_ADD(Predictor2_C, PredictorAdd2_C)
GENERATE_PREDICTOR_ADD(Predictor3_C, PredictorAdd3_C)
GENERATE_PREDICTOR_ADD(Predictor4_C, PredictorAdd4_C)
GENERATE_PREDICTOR_ADD(Predictor5_C, PredictorAdd5_C)
GENERATE_PREDICTOR_ADD(Predictor6_C, PredictorAdd6_C)
GENERATE_PREDICTOR_ADD(Predictor7_C, PredictorAdd7_C)
GENERATE_PREDICTOR_ADD(Predictor8_C, PredictorAdd8_C)
GENERATE_PREDICTOR_ADD(Predictor9_C, PredictorAdd9_C)
GENERATE_PREDICTOR_ADD(Predictor10_C, PredictorAdd10_C)
GENERATE_PREDICTOR_ADD(Predictor11_C, PredictorAdd11_C)
GENERATE_PREDICTOR_ADD(Predictor12_C, PredictorAdd12_C)
GENERATE_PREDICTOR_ADD(Predictor13_C, PredictorAdd13_C)
GENERATE_PREDICTOR_ADD(VP8LPredictor2_C, PredictorAdd2_C)
GENERATE_PREDICTOR_ADD(VP8LPredictor3_C, PredictorAdd3_C)
GENERATE_PREDICTOR_ADD(VP8LPredictor4_C, PredictorAdd4_C)
GENERATE_PREDICTOR_ADD(VP8LPredictor5_C, PredictorAdd5_C)
GENERATE_PREDICTOR_ADD(VP8LPredictor6_C, PredictorAdd6_C)
GENERATE_PREDICTOR_ADD(VP8LPredictor7_C, PredictorAdd7_C)
GENERATE_PREDICTOR_ADD(VP8LPredictor8_C, PredictorAdd8_C)
GENERATE_PREDICTOR_ADD(VP8LPredictor9_C, PredictorAdd9_C)
GENERATE_PREDICTOR_ADD(VP8LPredictor10_C, PredictorAdd10_C)
GENERATE_PREDICTOR_ADD(VP8LPredictor11_C, PredictorAdd11_C)
GENERATE_PREDICTOR_ADD(VP8LPredictor12_C, PredictorAdd12_C)
GENERATE_PREDICTOR_ADD(VP8LPredictor13_C, PredictorAdd13_C)
//------------------------------------------------------------------------------
@@ -562,7 +562,6 @@ VP8LPredictorFunc VP8LPredictors[16];
// exposed plain-C implementations
VP8LPredictorAddSubFunc VP8LPredictorsAdd_C[16];
VP8LPredictorFunc VP8LPredictors_C[16];
VP8LTransformColorInverseFunc VP8LTransformColorInverse;
@@ -576,6 +575,7 @@ VP8LMapARGBFunc VP8LMapColor32b;
VP8LMapAlphaFunc VP8LMapColor8b;
extern void VP8LDspInitSSE2(void);
extern void VP8LDspInitSSE41(void);
extern void VP8LDspInitNEON(void);
extern void VP8LDspInitMIPSdspR2(void);
extern void VP8LDspInitMSA(void);
@@ -600,8 +600,7 @@ extern void VP8LDspInitMSA(void);
} while (0);
WEBP_DSP_INIT_FUNC(VP8LDspInit) {
COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors)
COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors_C)
COPY_PREDICTOR_ARRAY(VP8LPredictor, VP8LPredictors)
COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd)
COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd_C)
@@ -623,9 +622,14 @@ WEBP_DSP_INIT_FUNC(VP8LDspInit) {
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8LDspInitSSE2();
#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
VP8LDspInitSSE41();
}
#endif
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
@@ -640,7 +644,7 @@ WEBP_DSP_INIT_FUNC(VP8LDspInit) {
#endif
}
#if defined(WEBP_USE_NEON)
#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8LDspInitNEON();

View File

@@ -30,7 +30,22 @@ extern "C" {
typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top);
extern VP8LPredictorFunc VP8LPredictors[16];
extern VP8LPredictorFunc VP8LPredictors_C[16];
uint32_t VP8LPredictor0_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor1_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor2_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor3_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor4_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor5_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor6_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor7_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor8_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor9_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor10_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor11_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor12_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor13_C(uint32_t left, const uint32_t* const top);
// These Add/Sub function expects upper[-1] and out[-1] to be readable.
typedef void (*VP8LPredictorAddSubFunc)(const uint32_t* in,
const uint32_t* upper, int num_pixels,

View File

@@ -184,19 +184,6 @@ static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper, \
} \
}
// It subtracts the prediction from the input pixel and stores the residual
// in the output pixel.
#define GENERATE_PREDICTOR_SUB(PREDICTOR, PREDICTOR_SUB) \
static void PREDICTOR_SUB(const uint32_t* in, const uint32_t* upper, \
int num_pixels, uint32_t* out) { \
int x; \
assert(upper != NULL); \
for (x = 0; x < num_pixels; ++x) { \
const uint32_t pred = (PREDICTOR)(in[x - 1], upper + x); \
out[x] = VP8LSubPixels(in[x], pred); \
} \
}
#ifdef __cplusplus
} // extern "C"
#endif

View File

@@ -329,6 +329,15 @@ const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
static float FastSLog2Slow_C(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
// use clz if available
const int log_cnt = BitsLog2Floor(v) - 7;
const uint32_t y = 1 << log_cnt;
int correction = 0;
const float v_f = (float)v;
const uint32_t orig_v = v;
v >>= log_cnt;
#else
int log_cnt = 0;
uint32_t y = 1;
int correction = 0;
@@ -339,6 +348,7 @@ static float FastSLog2Slow_C(uint32_t v) {
v = v >> 1;
y = y << 1;
} while (v >= LOG_LOOKUP_IDX_MAX);
#endif
// vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
// Xf = floor(Xf) * (1 + (v % y) / v)
// log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
@@ -355,6 +365,14 @@ static float FastSLog2Slow_C(uint32_t v) {
static float FastLog2Slow_C(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
// use clz if available
const int log_cnt = BitsLog2Floor(v) - 7;
const uint32_t y = 1 << log_cnt;
const uint32_t orig_v = v;
double log_2;
v >>= log_cnt;
#else
int log_cnt = 0;
uint32_t y = 1;
const uint32_t orig_v = v;
@@ -364,6 +382,7 @@ static float FastLog2Slow_C(uint32_t v) {
v = v >> 1;
y = y << 1;
} while (v >= LOG_LOOKUP_IDX_MAX);
#endif
log_2 = kLog2Table[v] + log_cnt;
if (orig_v >= APPROX_LOG_MAX) {
// Since the division is still expensive, add this correction factor only
@@ -702,140 +721,6 @@ void VP8LHistogramAdd(const VP8LHistogram* const a,
//------------------------------------------------------------------------------
// Image transforms.
static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
return (((a0 ^ a1) & 0xfefefefeu) >> 1) + (a0 & a1);
}
static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
return Average2(Average2(a0, a2), a1);
}
static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
uint32_t a2, uint32_t a3) {
return Average2(Average2(a0, a1), Average2(a2, a3));
}
static WEBP_INLINE uint32_t Clip255(uint32_t a) {
if (a < 256) {
return a;
}
// return 0, when a is a negative integer.
// return 255, when a is positive.
return ~a >> 24;
}
static WEBP_INLINE int AddSubtractComponentFull(int a, int b, int c) {
return Clip255(a + b - c);
}
static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
uint32_t c2) {
const int a = AddSubtractComponentFull(c0 >> 24, c1 >> 24, c2 >> 24);
const int r = AddSubtractComponentFull((c0 >> 16) & 0xff,
(c1 >> 16) & 0xff,
(c2 >> 16) & 0xff);
const int g = AddSubtractComponentFull((c0 >> 8) & 0xff,
(c1 >> 8) & 0xff,
(c2 >> 8) & 0xff);
const int b = AddSubtractComponentFull(c0 & 0xff, c1 & 0xff, c2 & 0xff);
return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
}
static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) {
return Clip255(a + (a - b) / 2);
}
static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
uint32_t c2) {
const uint32_t ave = Average2(c0, c1);
const int a = AddSubtractComponentHalf(ave >> 24, c2 >> 24);
const int r = AddSubtractComponentHalf((ave >> 16) & 0xff, (c2 >> 16) & 0xff);
const int g = AddSubtractComponentHalf((ave >> 8) & 0xff, (c2 >> 8) & 0xff);
const int b = AddSubtractComponentHalf((ave >> 0) & 0xff, (c2 >> 0) & 0xff);
return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
}
// gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined.
#if defined(__arm__) && \
(LOCAL_GCC_VERSION == 0x409 || LOCAL_GCC_VERSION == 0x408)
# define LOCAL_INLINE __attribute__ ((noinline))
#else
# define LOCAL_INLINE WEBP_INLINE
#endif
static LOCAL_INLINE int Sub3(int a, int b, int c) {
const int pb = b - c;
const int pa = a - c;
return abs(pb) - abs(pa);
}
#undef LOCAL_INLINE
static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
const int pa_minus_pb =
Sub3((a >> 24) , (b >> 24) , (c >> 24) ) +
Sub3((a >> 16) & 0xff, (b >> 16) & 0xff, (c >> 16) & 0xff) +
Sub3((a >> 8) & 0xff, (b >> 8) & 0xff, (c >> 8) & 0xff) +
Sub3((a ) & 0xff, (b ) & 0xff, (c ) & 0xff);
return (pa_minus_pb <= 0) ? a : b;
}
//------------------------------------------------------------------------------
// Predictors
static uint32_t Predictor2(uint32_t left, const uint32_t* const top) {
(void)left;
return top[0];
}
static uint32_t Predictor3(uint32_t left, const uint32_t* const top) {
(void)left;
return top[1];
}
static uint32_t Predictor4(uint32_t left, const uint32_t* const top) {
(void)left;
return top[-1];
}
static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average3(left, top[0], top[1]);
return pred;
}
static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(left, top[-1]);
return pred;
}
static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(left, top[0]);
return pred;
}
static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(top[-1], top[0]);
(void)left;
return pred;
}
static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(top[0], top[1]);
(void)left;
return pred;
}
static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
return pred;
}
static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Select(top[0], left, top[-1]);
return pred;
}
static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
return pred;
}
static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
return pred;
}
//------------------------------------------------------------------------------
static void PredictorSub0_C(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
@@ -850,18 +735,33 @@ static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper,
(void)upper;
}
GENERATE_PREDICTOR_SUB(Predictor2, PredictorSub2_C)
GENERATE_PREDICTOR_SUB(Predictor3, PredictorSub3_C)
GENERATE_PREDICTOR_SUB(Predictor4, PredictorSub4_C)
GENERATE_PREDICTOR_SUB(Predictor5, PredictorSub5_C)
GENERATE_PREDICTOR_SUB(Predictor6, PredictorSub6_C)
GENERATE_PREDICTOR_SUB(Predictor7, PredictorSub7_C)
GENERATE_PREDICTOR_SUB(Predictor8, PredictorSub8_C)
GENERATE_PREDICTOR_SUB(Predictor9, PredictorSub9_C)
GENERATE_PREDICTOR_SUB(Predictor10, PredictorSub10_C)
GENERATE_PREDICTOR_SUB(Predictor11, PredictorSub11_C)
GENERATE_PREDICTOR_SUB(Predictor12, PredictorSub12_C)
GENERATE_PREDICTOR_SUB(Predictor13, PredictorSub13_C)
// It subtracts the prediction from the input pixel and stores the residual
// in the output pixel.
#define GENERATE_PREDICTOR_SUB(PREDICTOR_I) \
static void PredictorSub##PREDICTOR_I##_C(const uint32_t* in, \
const uint32_t* upper, \
int num_pixels, uint32_t* out) { \
int x; \
assert(upper != NULL); \
for (x = 0; x < num_pixels; ++x) { \
const uint32_t pred = \
VP8LPredictor##PREDICTOR_I##_C(in[x - 1], upper + x); \
out[x] = VP8LSubPixels(in[x], pred); \
} \
}
GENERATE_PREDICTOR_SUB(2)
GENERATE_PREDICTOR_SUB(3)
GENERATE_PREDICTOR_SUB(4)
GENERATE_PREDICTOR_SUB(5)
GENERATE_PREDICTOR_SUB(6)
GENERATE_PREDICTOR_SUB(7)
GENERATE_PREDICTOR_SUB(8)
GENERATE_PREDICTOR_SUB(9)
GENERATE_PREDICTOR_SUB(10)
GENERATE_PREDICTOR_SUB(11)
GENERATE_PREDICTOR_SUB(12)
GENERATE_PREDICTOR_SUB(13)
//------------------------------------------------------------------------------
@@ -962,10 +862,10 @@ WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8LEncDspInitSSE2();
#if defined(WEBP_USE_SSE41)
#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
VP8LEncDspInitSSE41();
}
@@ -989,7 +889,7 @@ WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
#endif
}
#if defined(WEBP_USE_NEON)
#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8LEncDspInitNEON();

View File

@@ -232,76 +232,55 @@ static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) {
//------------------------------------------------------------------------------
// Entropy
// Checks whether the X or Y contribution is worth computing and adding.
// Used in loop unrolling.
#define ANALYZE_X_OR_Y(x_or_y, j) \
do { \
if ((x_or_y)[i + (j)] != 0) retval -= VP8LFastSLog2((x_or_y)[i + (j)]); \
} while (0)
// Checks whether the X + Y contribution is worth computing and adding.
// Used in loop unrolling.
#define ANALYZE_XY(j) \
do { \
if (tmp[j] != 0) { \
retval -= VP8LFastSLog2(tmp[j]); \
ANALYZE_X_OR_Y(X, j); \
} \
} while (0)
// TODO(https://crbug.com/webp/499): this function produces different results
// from the C code due to use of double/float resulting in output differences
// when compared to -noasm.
#if !(defined(WEBP_HAVE_SLOW_CLZ_CTZ) || defined(__i386__) || defined(_M_IX86))
static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
int i;
double retval = 0.;
int sumX, sumXY;
int32_t tmp[4];
__m128i zero = _mm_setzero_si128();
// Sums up X + Y, 4 ints at a time (and will merge it at the end for sumXY).
__m128i sumXY_128 = zero;
__m128i sumX_128 = zero;
int sumX = 0, sumXY = 0;
const __m128i zero = _mm_setzero_si128();
for (i = 0; i < 256; i += 4) {
const __m128i x = _mm_loadu_si128((const __m128i*)(X + i));
const __m128i y = _mm_loadu_si128((const __m128i*)(Y + i));
// Check if any X is non-zero: this actually provides a speedup as X is
// usually sparse.
if (_mm_movemask_epi8(_mm_cmpeq_epi32(x, zero)) != 0xFFFF) {
const __m128i xy_128 = _mm_add_epi32(x, y);
sumXY_128 = _mm_add_epi32(sumXY_128, xy_128);
sumX_128 = _mm_add_epi32(sumX_128, x);
// Analyze the different X + Y.
_mm_storeu_si128((__m128i*)tmp, xy_128);
ANALYZE_XY(0);
ANALYZE_XY(1);
ANALYZE_XY(2);
ANALYZE_XY(3);
} else {
// X is fully 0, so only deal with Y.
sumXY_128 = _mm_add_epi32(sumXY_128, y);
ANALYZE_X_OR_Y(Y, 0);
ANALYZE_X_OR_Y(Y, 1);
ANALYZE_X_OR_Y(Y, 2);
ANALYZE_X_OR_Y(Y, 3);
for (i = 0; i < 256; i += 16) {
const __m128i x0 = _mm_loadu_si128((const __m128i*)(X + i + 0));
const __m128i y0 = _mm_loadu_si128((const __m128i*)(Y + i + 0));
const __m128i x1 = _mm_loadu_si128((const __m128i*)(X + i + 4));
const __m128i y1 = _mm_loadu_si128((const __m128i*)(Y + i + 4));
const __m128i x2 = _mm_loadu_si128((const __m128i*)(X + i + 8));
const __m128i y2 = _mm_loadu_si128((const __m128i*)(Y + i + 8));
const __m128i x3 = _mm_loadu_si128((const __m128i*)(X + i + 12));
const __m128i y3 = _mm_loadu_si128((const __m128i*)(Y + i + 12));
const __m128i x4 = _mm_packs_epi16(_mm_packs_epi32(x0, x1),
_mm_packs_epi32(x2, x3));
const __m128i y4 = _mm_packs_epi16(_mm_packs_epi32(y0, y1),
_mm_packs_epi32(y2, y3));
const int32_t mx = _mm_movemask_epi8(_mm_cmpgt_epi8(x4, zero));
int32_t my = _mm_movemask_epi8(_mm_cmpgt_epi8(y4, zero)) | mx;
while (my) {
const int32_t j = BitsCtz(my);
int xy;
if ((mx >> j) & 1) {
const int x = X[i + j];
sumXY += x;
retval -= VP8LFastSLog2(x);
}
xy = X[i + j] + Y[i + j];
sumX += xy;
retval -= VP8LFastSLog2(xy);
my &= my - 1;
}
}
// Sum up sumX_128 to get sumX.
_mm_storeu_si128((__m128i*)tmp, sumX_128);
sumX = tmp[3] + tmp[2] + tmp[1] + tmp[0];
// Sum up sumXY_128 to get sumXY.
_mm_storeu_si128((__m128i*)tmp, sumXY_128);
sumXY = tmp[3] + tmp[2] + tmp[1] + tmp[0];
retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
return (float)retval;
}
#undef ANALYZE_X_OR_Y
#undef ANALYZE_XY
#else
#define DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC // won't be faster
#endif
//------------------------------------------------------------------------------
@@ -460,20 +439,22 @@ static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper,
(void)upper;
}
#define GENERATE_PREDICTOR_1(X, IN) \
static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
int num_pixels, uint32_t* out) { \
int i; \
for (i = 0; i + 4 <= num_pixels; i += 4) { \
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN)); \
const __m128i res = _mm_sub_epi8(src, pred); \
_mm_storeu_si128((__m128i*)&out[i], res); \
} \
if (i != num_pixels) { \
VP8LPredictorsSub_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
} \
}
#define GENERATE_PREDICTOR_1(X, IN) \
static void PredictorSub##X##_SSE2(const uint32_t* const in, \
const uint32_t* const upper, \
int num_pixels, uint32_t* const out) { \
int i; \
for (i = 0; i + 4 <= num_pixels; i += 4) { \
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN)); \
const __m128i res = _mm_sub_epi8(src, pred); \
_mm_storeu_si128((__m128i*)&out[i], res); \
} \
if (i != num_pixels) { \
VP8LPredictorsSub_C[(X)](in + i, WEBP_OFFSET_PTR(upper, i), \
num_pixels - i, out + i); \
} \
}
GENERATE_PREDICTOR_1(1, in[i - 1]) // Predictor1: L
GENERATE_PREDICTOR_1(2, upper[i]) // Predictor2: T
@@ -657,7 +638,9 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE2;
VP8LAddVector = AddVector_SSE2;
VP8LAddVectorEq = AddVectorEq_SSE2;
#if !defined(DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC)
VP8LCombinedShannonEntropy = CombinedShannonEntropy_SSE2;
#endif
VP8LVectorMismatch = VectorMismatch_SSE2;
VP8LBundleColorMap = BundleColorMap_SSE2;

View File

@@ -44,46 +44,47 @@ static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
//------------------------------------------------------------------------------
// Color Transform
#define SPAN 8
#define MK_CST_16(HI, LO) \
_mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]) {
const __m128i mults_r = _mm_set1_epi16(CST_5b(red_to_blue));
const __m128i mults_g = _mm_set1_epi16(CST_5b(green_to_blue));
const __m128i mask_g = _mm_set1_epi16((short)0xff00); // green mask
const __m128i mask_gb = _mm_set1_epi32(0xffff); // green/blue mask
const __m128i mask_b = _mm_set1_epi16(0x00ff); // blue mask
const __m128i shuffler_lo = _mm_setr_epi8(-1, 2, -1, 6, -1, 10, -1, 14, -1,
-1, -1, -1, -1, -1, -1, -1);
const __m128i shuffler_hi = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1,
2, -1, 6, -1, 10, -1, 14);
int y;
for (y = 0; y < tile_height; ++y) {
const uint32_t* const src = argb + y * stride;
int i, x;
for (x = 0; x + SPAN <= tile_width; x += SPAN) {
uint16_t values[SPAN];
const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
const __m128i r0 = _mm_shuffle_epi8(in0, shuffler_lo);
const __m128i r1 = _mm_shuffle_epi8(in1, shuffler_hi);
const __m128i r = _mm_or_si128(r0, r1); // r 0
const __m128i gb0 = _mm_and_si128(in0, mask_gb);
const __m128i gb1 = _mm_and_si128(in1, mask_gb);
const __m128i gb = _mm_packus_epi32(gb0, gb1); // g b
const __m128i g = _mm_and_si128(gb, mask_g); // g 0
const __m128i A = _mm_mulhi_epi16(r, mults_r); // x dbr
const __m128i B = _mm_mulhi_epi16(g, mults_g); // x dbg
const __m128i C = _mm_sub_epi8(gb, B); // x b'
const __m128i D = _mm_sub_epi8(C, A); // x b''
const __m128i E = _mm_and_si128(D, mask_b); // 0 b''
_mm_storeu_si128((__m128i*)values, E);
for (i = 0; i < SPAN; ++i) ++histo[values[i]];
const __m128i mult =
MK_CST_16(CST_5b(red_to_blue) + 256,CST_5b(green_to_blue));
const __m128i perm =
_mm_setr_epi8(-1, 1, -1, 2, -1, 5, -1, 6, -1, 9, -1, 10, -1, 13, -1, 14);
if (tile_width >= 4) {
int y;
for (y = 0; y < tile_height; ++y) {
const uint32_t* const src = argb + y * stride;
const __m128i A1 = _mm_loadu_si128((const __m128i*)src);
const __m128i B1 = _mm_shuffle_epi8(A1, perm);
const __m128i C1 = _mm_mulhi_epi16(B1, mult);
const __m128i D1 = _mm_sub_epi16(A1, C1);
__m128i E = _mm_add_epi16(_mm_srli_epi32(D1, 16), D1);
int x;
for (x = 4; x + 4 <= tile_width; x += 4) {
const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x));
__m128i B2, C2, D2;
++histo[_mm_extract_epi8(E, 0)];
B2 = _mm_shuffle_epi8(A2, perm);
++histo[_mm_extract_epi8(E, 4)];
C2 = _mm_mulhi_epi16(B2, mult);
++histo[_mm_extract_epi8(E, 8)];
D2 = _mm_sub_epi16(A2, C2);
++histo[_mm_extract_epi8(E, 12)];
E = _mm_add_epi16(_mm_srli_epi32(D2, 16), D2);
}
++histo[_mm_extract_epi8(E, 0)];
++histo[_mm_extract_epi8(E, 4)];
++histo[_mm_extract_epi8(E, 8)];
++histo[_mm_extract_epi8(E, 12)];
}
}
{
const int left_over = tile_width & (SPAN - 1);
const int left_over = tile_width & 3;
if (left_over > 0) {
VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
left_over, tile_height,
@@ -95,33 +96,37 @@ static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]) {
const __m128i mults_g = _mm_set1_epi16(CST_5b(green_to_red));
const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
const __m128i mask = _mm_set1_epi16(0xff);
int y;
for (y = 0; y < tile_height; ++y) {
const uint32_t* const src = argb + y * stride;
int i, x;
for (x = 0; x + SPAN <= tile_width; x += SPAN) {
uint16_t values[SPAN];
const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
const __m128i g0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0
const __m128i g1 = _mm_and_si128(in1, mask_g);
const __m128i g = _mm_packus_epi32(g0, g1); // g 0
const __m128i A0 = _mm_srli_epi32(in0, 16); // 0 0 | x r
const __m128i A1 = _mm_srli_epi32(in1, 16);
const __m128i A = _mm_packus_epi32(A0, A1); // x r
const __m128i B = _mm_mulhi_epi16(g, mults_g); // x dr
const __m128i C = _mm_sub_epi8(A, B); // x r'
const __m128i D = _mm_and_si128(C, mask); // 0 r'
_mm_storeu_si128((__m128i*)values, D);
for (i = 0; i < SPAN; ++i) ++histo[values[i]];
const __m128i mult = MK_CST_16(0, CST_5b(green_to_red));
const __m128i mask_g = _mm_set1_epi32(0x0000ff00);
if (tile_width >= 4) {
int y;
for (y = 0; y < tile_height; ++y) {
const uint32_t* const src = argb + y * stride;
const __m128i A1 = _mm_loadu_si128((const __m128i*)src);
const __m128i B1 = _mm_and_si128(A1, mask_g);
const __m128i C1 = _mm_madd_epi16(B1, mult);
__m128i D = _mm_sub_epi16(A1, C1);
int x;
for (x = 4; x + 4 <= tile_width; x += 4) {
const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x));
__m128i B2, C2;
++histo[_mm_extract_epi8(D, 2)];
B2 = _mm_and_si128(A2, mask_g);
++histo[_mm_extract_epi8(D, 6)];
C2 = _mm_madd_epi16(B2, mult);
++histo[_mm_extract_epi8(D, 10)];
++histo[_mm_extract_epi8(D, 14)];
D = _mm_sub_epi16(A2, C2);
}
++histo[_mm_extract_epi8(D, 2)];
++histo[_mm_extract_epi8(D, 6)];
++histo[_mm_extract_epi8(D, 10)];
++histo[_mm_extract_epi8(D, 14)];
}
}
{
const int left_over = tile_width & (SPAN - 1);
const int left_over = tile_width & 3;
if (left_over > 0) {
VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
left_over, tile_height, green_to_red,
@@ -130,6 +135,8 @@ static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
}
}
#undef MK_CST_16
//------------------------------------------------------------------------------
// Entry point

View File

@@ -18,7 +18,6 @@
#include "src/dsp/common_sse2.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
#include <assert.h>
#include <emmintrin.h>
//------------------------------------------------------------------------------

View File

@@ -0,0 +1,132 @@
// Copyright 2021 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE41 variant of methods for lossless decoder
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE41)
#include "src/dsp/common_sse41.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
//------------------------------------------------------------------------------
// Color-space conversion functions
static void TransformColorInverse_SSE41(const VP8LMultipliers* const m,
const uint32_t* const src,
int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend
const __m128i mults_rb = _mm_set1_epi32((uint32_t)CST(green_to_red_) << 16 |
(CST(green_to_blue_) & 0xffff));
const __m128i mults_b2 = _mm_set1_epi32(CST(red_to_blue_));
#undef CST
const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);
const __m128i perm1 = _mm_setr_epi8(-1, 1, -1, 1, -1, 5, -1, 5,
-1, 9, -1, 9, -1, 13, -1, 13);
const __m128i perm2 = _mm_setr_epi8(-1, 2, -1, -1, -1, 6, -1, -1,
-1, 10, -1, -1, -1, 14, -1, -1);
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i A = _mm_loadu_si128((const __m128i*)(src + i));
const __m128i B = _mm_shuffle_epi8(A, perm1); // argb -> g0g0
const __m128i C = _mm_mulhi_epi16(B, mults_rb);
const __m128i D = _mm_add_epi8(A, C);
const __m128i E = _mm_shuffle_epi8(D, perm2);
const __m128i F = _mm_mulhi_epi16(E, mults_b2);
const __m128i G = _mm_add_epi8(D, F);
const __m128i out = _mm_blendv_epi8(G, A, mask_ag);
_mm_storeu_si128((__m128i*)&dst[i], out);
}
// Fall-back to C-version for left-overs.
if (i != num_pixels) {
VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
}
}
//------------------------------------------------------------------------------
#define ARGB_TO_RGB_SSE41 do { \
while (num_pixels >= 16) { \
const __m128i in0 = _mm_loadu_si128(in + 0); \
const __m128i in1 = _mm_loadu_si128(in + 1); \
const __m128i in2 = _mm_loadu_si128(in + 2); \
const __m128i in3 = _mm_loadu_si128(in + 3); \
const __m128i a0 = _mm_shuffle_epi8(in0, perm0); \
const __m128i a1 = _mm_shuffle_epi8(in1, perm1); \
const __m128i a2 = _mm_shuffle_epi8(in2, perm2); \
const __m128i a3 = _mm_shuffle_epi8(in3, perm3); \
const __m128i b0 = _mm_blend_epi16(a0, a1, 0xc0); \
const __m128i b1 = _mm_blend_epi16(a1, a2, 0xf0); \
const __m128i b2 = _mm_blend_epi16(a2, a3, 0xfc); \
_mm_storeu_si128(out + 0, b0); \
_mm_storeu_si128(out + 1, b1); \
_mm_storeu_si128(out + 2, b2); \
in += 4; \
out += 3; \
num_pixels -= 16; \
} \
} while (0)
static void ConvertBGRAToRGB_SSE41(const uint32_t* src, int num_pixels,
uint8_t* dst) {
const __m128i* in = (const __m128i*)src;
__m128i* out = (__m128i*)dst;
const __m128i perm0 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9,
8, 14, 13, 12, -1, -1, -1, -1);
const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39);
const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e);
const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93);
ARGB_TO_RGB_SSE41;
// left-overs
if (num_pixels > 0) {
VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
}
}
static void ConvertBGRAToBGR_SSE41(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const __m128i* in = (const __m128i*)src;
__m128i* out = (__m128i*)dst;
const __m128i perm0 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10,
12, 13, 14, -1, -1, -1, -1);
const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39);
const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e);
const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93);
ARGB_TO_RGB_SSE41;
// left-overs
if (num_pixels > 0) {
VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
}
}
#undef ARGB_TO_RGB_SSE41
//------------------------------------------------------------------------------
// Entry point
extern void VP8LDspInitSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE41(void) {
VP8LTransformColorInverse = TransformColorInverse_SSE41;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE41;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE41;
}
#else // !WEBP_USE_SSE41
WEBP_DSP_INIT_STUB(VP8LDspInitSSE41)
#endif // WEBP_USE_SSE41

View File

@@ -38,8 +38,9 @@ void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
int x_out = channel;
// simple bilinear interpolation
int accum = wrk->x_add;
int left = src[x_in];
int right = (wrk->src_width > 1) ? src[x_in + x_stride] : left;
rescaler_t left = (rescaler_t)src[x_in];
rescaler_t right =
(wrk->src_width > 1) ? (rescaler_t)src[x_in + x_stride] : left;
x_in += x_stride;
while (1) {
wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
@@ -50,7 +51,7 @@ void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
left = right;
x_in += x_stride;
assert(x_in < wrk->src_width * x_stride);
right = src[x_in];
right = (rescaler_t)src[x_in];
accum += wrk->x_add;
}
}
@@ -213,7 +214,7 @@ WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
WebPRescalerImportRowShrink = WebPRescalerImportRowShrink_C;
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPRescalerDspInitSSE2();
}
@@ -235,7 +236,7 @@ WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
#endif
}
#if defined(WEBP_USE_NEON)
#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPRescalerDspInitNEON();

View File

@@ -150,7 +150,7 @@ WEBP_DSP_INIT_FUNC(VP8SSIMDspInit) {
#endif
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8SSIMDspInitSSE2();
}

View File

@@ -233,12 +233,12 @@ WEBP_DSP_INIT_FUNC(WebPInitYUV444Converters) {
WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444_C;
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitYUV444ConvertersSSE2();
}
#endif
#if defined(WEBP_USE_SSE41)
#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitYUV444ConvertersSSE41();
}
@@ -278,12 +278,12 @@ WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitUpsamplersSSE2();
}
#endif
#if defined(WEBP_USE_SSE41)
#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitUpsamplersSSE41();
}
@@ -300,7 +300,7 @@ WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
#endif
}
#if defined(WEBP_USE_NEON)
#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPInitUpsamplersNEON();

View File

@@ -90,16 +90,16 @@ WEBP_DSP_INIT_FUNC(WebPInitSamplers) {
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitSamplersSSE2();
}
#endif // WEBP_USE_SSE2
#if defined(WEBP_USE_SSE41)
#endif // WEBP_HAVE_SSE2
#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitSamplersSSE41();
}
#endif // WEBP_USE_SSE41
#endif // WEBP_HAVE_SSE41
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
WebPInitSamplersMIPS32();
@@ -276,26 +276,26 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
#endif
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
#if defined(WEBP_HAVE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitConvertARGBToYUVSSE2();
WebPInitSharpYUVSSE2();
}
#endif // WEBP_USE_SSE2
#if defined(WEBP_USE_SSE41)
#endif // WEBP_HAVE_SSE2
#if defined(WEBP_HAVE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitConvertARGBToYUVSSE41();
}
#endif // WEBP_USE_SSE41
#endif // WEBP_HAVE_SSE41
}
#if defined(WEBP_USE_NEON)
#if defined(WEBP_HAVE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPInitConvertARGBToYUVNEON();
WebPInitSharpYUVNEON();
}
#endif // WEBP_USE_NEON
#endif // WEBP_HAVE_NEON
assert(WebPConvertARGBToY != NULL);
assert(WebPConvertARGBToUV != NULL);