You've already forked godot
mirror of
https://github.com/godotengine/godot.git
synced 2025-11-24 15:26:15 +00:00
libwebp: Sync with upstream 1.2.1
Changes: https://chromium.googlesource.com/webm/libwebp/+/1.2.1/NEWS
This commit is contained in:
73
thirdparty/libwebp/src/dsp/alpha_processing.c
vendored
73
thirdparty/libwebp/src/dsp/alpha_processing.c
vendored
@@ -157,7 +157,8 @@ void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse) {
|
||||
}
|
||||
}
|
||||
|
||||
void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
|
||||
void WebPMultRow_C(uint8_t* WEBP_RESTRICT const ptr,
|
||||
const uint8_t* WEBP_RESTRICT const alpha,
|
||||
int width, int inverse) {
|
||||
int x;
|
||||
for (x = 0; x < width; ++x) {
|
||||
@@ -178,7 +179,8 @@ void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
|
||||
#undef MFIX
|
||||
|
||||
void (*WebPMultARGBRow)(uint32_t* const ptr, int width, int inverse);
|
||||
void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
|
||||
void (*WebPMultRow)(uint8_t* WEBP_RESTRICT const ptr,
|
||||
const uint8_t* WEBP_RESTRICT const alpha,
|
||||
int width, int inverse);
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
@@ -193,8 +195,8 @@ void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
|
||||
}
|
||||
}
|
||||
|
||||
void WebPMultRows(uint8_t* ptr, int stride,
|
||||
const uint8_t* alpha, int alpha_stride,
|
||||
void WebPMultRows(uint8_t* WEBP_RESTRICT ptr, int stride,
|
||||
const uint8_t* WEBP_RESTRICT alpha, int alpha_stride,
|
||||
int width, int num_rows, int inverse) {
|
||||
int n;
|
||||
for (n = 0; n < num_rows; ++n) {
|
||||
@@ -290,9 +292,9 @@ static void ApplyAlphaMultiply_16b_C(uint8_t* rgba4444,
|
||||
}
|
||||
|
||||
#if !WEBP_NEON_OMIT_C_CODE
|
||||
static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
|
||||
static int DispatchAlpha_C(const uint8_t* WEBP_RESTRICT alpha, int alpha_stride,
|
||||
int width, int height,
|
||||
uint8_t* dst, int dst_stride) {
|
||||
uint8_t* WEBP_RESTRICT dst, int dst_stride) {
|
||||
uint32_t alpha_mask = 0xff;
|
||||
int i, j;
|
||||
|
||||
@@ -309,9 +311,10 @@ static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
|
||||
return (alpha_mask != 0xff);
|
||||
}
|
||||
|
||||
static void DispatchAlphaToGreen_C(const uint8_t* alpha, int alpha_stride,
|
||||
int width, int height,
|
||||
uint32_t* dst, int dst_stride) {
|
||||
static void DispatchAlphaToGreen_C(const uint8_t* WEBP_RESTRICT alpha,
|
||||
int alpha_stride, int width, int height,
|
||||
uint32_t* WEBP_RESTRICT dst,
|
||||
int dst_stride) {
|
||||
int i, j;
|
||||
for (j = 0; j < height; ++j) {
|
||||
for (i = 0; i < width; ++i) {
|
||||
@@ -322,9 +325,9 @@ static void DispatchAlphaToGreen_C(const uint8_t* alpha, int alpha_stride,
|
||||
}
|
||||
}
|
||||
|
||||
static int ExtractAlpha_C(const uint8_t* argb, int argb_stride,
|
||||
static int ExtractAlpha_C(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
|
||||
int width, int height,
|
||||
uint8_t* alpha, int alpha_stride) {
|
||||
uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
|
||||
uint8_t alpha_mask = 0xff;
|
||||
int i, j;
|
||||
|
||||
@@ -340,7 +343,8 @@ static int ExtractAlpha_C(const uint8_t* argb, int argb_stride,
|
||||
return (alpha_mask == 0xff);
|
||||
}
|
||||
|
||||
static void ExtractGreen_C(const uint32_t* argb, uint8_t* alpha, int size) {
|
||||
static void ExtractGreen_C(const uint32_t* WEBP_RESTRICT argb,
|
||||
uint8_t* WEBP_RESTRICT alpha, int size) {
|
||||
int i;
|
||||
for (i = 0; i < size; ++i) alpha[i] = argb[i] >> 8;
|
||||
}
|
||||
@@ -359,6 +363,11 @@ static int HasAlpha32b_C(const uint8_t* src, int length) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void AlphaReplace_C(uint32_t* src, int length, uint32_t color) {
|
||||
int x;
|
||||
for (x = 0; x < length; ++x) if ((src[x] >> 24) == 0) src[x] = color;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Simple channel manipulations.
|
||||
|
||||
@@ -367,8 +376,11 @@ static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
|
||||
}
|
||||
|
||||
#ifdef WORDS_BIGENDIAN
|
||||
static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g,
|
||||
const uint8_t* b, int len, uint32_t* out) {
|
||||
static void PackARGB_C(const uint8_t* WEBP_RESTRICT a,
|
||||
const uint8_t* WEBP_RESTRICT r,
|
||||
const uint8_t* WEBP_RESTRICT g,
|
||||
const uint8_t* WEBP_RESTRICT b,
|
||||
int len, uint32_t* WEBP_RESTRICT out) {
|
||||
int i;
|
||||
for (i = 0; i < len; ++i) {
|
||||
out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
|
||||
@@ -376,8 +388,10 @@ static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g,
|
||||
}
|
||||
#endif
|
||||
|
||||
static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
|
||||
int len, int step, uint32_t* out) {
|
||||
static void PackRGB_C(const uint8_t* WEBP_RESTRICT r,
|
||||
const uint8_t* WEBP_RESTRICT g,
|
||||
const uint8_t* WEBP_RESTRICT b,
|
||||
int len, int step, uint32_t* WEBP_RESTRICT out) {
|
||||
int i, offset = 0;
|
||||
for (i = 0; i < len; ++i) {
|
||||
out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
|
||||
@@ -387,19 +401,26 @@ static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
|
||||
|
||||
void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
|
||||
void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
|
||||
int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
|
||||
void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
|
||||
int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
|
||||
void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
|
||||
int (*WebPDispatchAlpha)(const uint8_t* WEBP_RESTRICT, int, int, int,
|
||||
uint8_t* WEBP_RESTRICT, int);
|
||||
void (*WebPDispatchAlphaToGreen)(const uint8_t* WEBP_RESTRICT, int, int, int,
|
||||
uint32_t* WEBP_RESTRICT, int);
|
||||
int (*WebPExtractAlpha)(const uint8_t* WEBP_RESTRICT, int, int, int,
|
||||
uint8_t* WEBP_RESTRICT, int);
|
||||
void (*WebPExtractGreen)(const uint32_t* WEBP_RESTRICT argb,
|
||||
uint8_t* WEBP_RESTRICT alpha, int size);
|
||||
#ifdef WORDS_BIGENDIAN
|
||||
void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r, const uint8_t* g,
|
||||
const uint8_t* b, int, uint32_t*);
|
||||
#endif
|
||||
void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
|
||||
int len, int step, uint32_t* out);
|
||||
void (*WebPPackRGB)(const uint8_t* WEBP_RESTRICT r,
|
||||
const uint8_t* WEBP_RESTRICT g,
|
||||
const uint8_t* WEBP_RESTRICT b,
|
||||
int len, int step, uint32_t* WEBP_RESTRICT out);
|
||||
|
||||
int (*WebPHasAlpha8b)(const uint8_t* src, int length);
|
||||
int (*WebPHasAlpha32b)(const uint8_t* src, int length);
|
||||
void (*WebPAlphaReplace)(uint32_t* src, int length, uint32_t color);
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Init function
|
||||
@@ -428,13 +449,14 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
|
||||
|
||||
WebPHasAlpha8b = HasAlpha8b_C;
|
||||
WebPHasAlpha32b = HasAlpha32b_C;
|
||||
WebPAlphaReplace = AlphaReplace_C;
|
||||
|
||||
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
#if defined(WEBP_HAVE_SSE2)
|
||||
if (VP8GetCPUInfo(kSSE2)) {
|
||||
WebPInitAlphaProcessingSSE2();
|
||||
#if defined(WEBP_USE_SSE41)
|
||||
#if defined(WEBP_HAVE_SSE41)
|
||||
if (VP8GetCPUInfo(kSSE4_1)) {
|
||||
WebPInitAlphaProcessingSSE41();
|
||||
}
|
||||
@@ -448,7 +470,7 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
#if defined(WEBP_HAVE_NEON)
|
||||
if (WEBP_NEON_OMIT_C_CODE ||
|
||||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
|
||||
WebPInitAlphaProcessingNEON();
|
||||
@@ -469,4 +491,5 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
|
||||
assert(WebPPackRGB != NULL);
|
||||
assert(WebPHasAlpha8b != NULL);
|
||||
assert(WebPHasAlpha32b != NULL);
|
||||
assert(WebPAlphaReplace != NULL);
|
||||
}
|
||||
|
||||
@@ -80,9 +80,9 @@ static void ApplyAlphaMultiply_NEON(uint8_t* rgba, int alpha_first,
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static int DispatchAlpha_NEON(const uint8_t* alpha, int alpha_stride,
|
||||
int width, int height,
|
||||
uint8_t* dst, int dst_stride) {
|
||||
static int DispatchAlpha_NEON(const uint8_t* WEBP_RESTRICT alpha,
|
||||
int alpha_stride, int width, int height,
|
||||
uint8_t* WEBP_RESTRICT dst, int dst_stride) {
|
||||
uint32_t alpha_mask = 0xffffffffu;
|
||||
uint8x8_t mask8 = vdup_n_u8(0xff);
|
||||
uint32_t tmp[2];
|
||||
@@ -112,9 +112,10 @@ static int DispatchAlpha_NEON(const uint8_t* alpha, int alpha_stride,
|
||||
return (alpha_mask != 0xffffffffu);
|
||||
}
|
||||
|
||||
static void DispatchAlphaToGreen_NEON(const uint8_t* alpha, int alpha_stride,
|
||||
int width, int height,
|
||||
uint32_t* dst, int dst_stride) {
|
||||
static void DispatchAlphaToGreen_NEON(const uint8_t* WEBP_RESTRICT alpha,
|
||||
int alpha_stride, int width, int height,
|
||||
uint32_t* WEBP_RESTRICT dst,
|
||||
int dst_stride) {
|
||||
int i, j;
|
||||
uint8x8x4_t greens; // leave A/R/B channels zero'd.
|
||||
greens.val[0] = vdup_n_u8(0);
|
||||
@@ -131,9 +132,9 @@ static void DispatchAlphaToGreen_NEON(const uint8_t* alpha, int alpha_stride,
|
||||
}
|
||||
}
|
||||
|
||||
static int ExtractAlpha_NEON(const uint8_t* argb, int argb_stride,
|
||||
static int ExtractAlpha_NEON(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
|
||||
int width, int height,
|
||||
uint8_t* alpha, int alpha_stride) {
|
||||
uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
|
||||
uint32_t alpha_mask = 0xffffffffu;
|
||||
uint8x8_t mask8 = vdup_n_u8(0xff);
|
||||
uint32_t tmp[2];
|
||||
@@ -161,8 +162,8 @@ static int ExtractAlpha_NEON(const uint8_t* argb, int argb_stride,
|
||||
return (alpha_mask == 0xffffffffu);
|
||||
}
|
||||
|
||||
static void ExtractGreen_NEON(const uint32_t* argb,
|
||||
uint8_t* alpha, int size) {
|
||||
static void ExtractGreen_NEON(const uint32_t* WEBP_RESTRICT argb,
|
||||
uint8_t* WEBP_RESTRICT alpha, int size) {
|
||||
int i;
|
||||
for (i = 0; i + 16 <= size; i += 16) {
|
||||
const uint8x16x4_t rgbX = vld4q_u8((const uint8_t*)(argb + i));
|
||||
|
||||
@@ -18,9 +18,9 @@
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
|
||||
int width, int height,
|
||||
uint8_t* dst, int dst_stride) {
|
||||
static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha,
|
||||
int alpha_stride, int width, int height,
|
||||
uint8_t* WEBP_RESTRICT dst, int dst_stride) {
|
||||
// alpha_and stores an 'and' operation of all the alpha[] values. The final
|
||||
// value is not 0xff if any of the alpha[] is not equal to 0xff.
|
||||
uint32_t alpha_and = 0xff;
|
||||
@@ -72,9 +72,10 @@ static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
|
||||
return (alpha_and != 0xff);
|
||||
}
|
||||
|
||||
static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
|
||||
int width, int height,
|
||||
uint32_t* dst, int dst_stride) {
|
||||
static void DispatchAlphaToGreen_SSE2(const uint8_t* WEBP_RESTRICT alpha,
|
||||
int alpha_stride, int width, int height,
|
||||
uint32_t* WEBP_RESTRICT dst,
|
||||
int dst_stride) {
|
||||
int i, j;
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const int limit = width & ~15;
|
||||
@@ -98,9 +99,9 @@ static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
|
||||
}
|
||||
}
|
||||
|
||||
static int ExtractAlpha_SSE2(const uint8_t* argb, int argb_stride,
|
||||
static int ExtractAlpha_SSE2(const uint8_t* WEBP_RESTRICT argb, int argb_stride,
|
||||
int width, int height,
|
||||
uint8_t* alpha, int alpha_stride) {
|
||||
uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
|
||||
// alpha_and stores an 'and' operation of all the alpha[] values. The final
|
||||
// value is not 0xff if any of the alpha[] is not equal to 0xff.
|
||||
uint32_t alpha_and = 0xff;
|
||||
@@ -265,6 +266,27 @@ static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void AlphaReplace_SSE2(uint32_t* src, int length, uint32_t color) {
|
||||
const __m128i m_color = _mm_set1_epi32(color);
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
int i = 0;
|
||||
for (; i + 8 <= length; i += 8) {
|
||||
const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0));
|
||||
const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 4));
|
||||
const __m128i b0 = _mm_srai_epi32(a0, 24);
|
||||
const __m128i b1 = _mm_srai_epi32(a1, 24);
|
||||
const __m128i c0 = _mm_cmpeq_epi32(b0, zero);
|
||||
const __m128i c1 = _mm_cmpeq_epi32(b1, zero);
|
||||
const __m128i d0 = _mm_and_si128(c0, m_color);
|
||||
const __m128i d1 = _mm_and_si128(c1, m_color);
|
||||
const __m128i e0 = _mm_andnot_si128(c0, a0);
|
||||
const __m128i e1 = _mm_andnot_si128(c1, a1);
|
||||
_mm_storeu_si128((__m128i*)(src + i + 0), _mm_or_si128(d0, e0));
|
||||
_mm_storeu_si128((__m128i*)(src + i + 4), _mm_or_si128(d1, e1));
|
||||
}
|
||||
for (; i < length; ++i) if ((src[i] >> 24) == 0) src[i] = color;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// Apply alpha value to rows
|
||||
|
||||
@@ -296,7 +318,8 @@ static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
|
||||
if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse);
|
||||
}
|
||||
|
||||
static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
|
||||
static void MultRow_SSE2(uint8_t* WEBP_RESTRICT const ptr,
|
||||
const uint8_t* WEBP_RESTRICT const alpha,
|
||||
int width, int inverse) {
|
||||
int x = 0;
|
||||
if (!inverse) {
|
||||
@@ -334,6 +357,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
|
||||
|
||||
WebPHasAlpha8b = HasAlpha8b_SSE2;
|
||||
WebPHasAlpha32b = HasAlpha32b_SSE2;
|
||||
WebPAlphaReplace = AlphaReplace_SSE2;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_SSE2
|
||||
|
||||
@@ -19,9 +19,9 @@
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static int ExtractAlpha_SSE41(const uint8_t* argb, int argb_stride,
|
||||
int width, int height,
|
||||
uint8_t* alpha, int alpha_stride) {
|
||||
static int ExtractAlpha_SSE41(const uint8_t* WEBP_RESTRICT argb,
|
||||
int argb_stride, int width, int height,
|
||||
uint8_t* WEBP_RESTRICT alpha, int alpha_stride) {
|
||||
// alpha_and stores an 'and' operation of all the alpha[] values. The final
|
||||
// value is not 0xff if any of the alpha[] is not equal to 0xff.
|
||||
uint32_t alpha_and = 0xff;
|
||||
|
||||
4
thirdparty/libwebp/src/dsp/cost.c
vendored
4
thirdparty/libwebp/src/dsp/cost.c
vendored
@@ -395,12 +395,12 @@ WEBP_DSP_INIT_FUNC(VP8EncDspCostInit) {
|
||||
VP8EncDspCostInitMIPSdspR2();
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
#if defined(WEBP_HAVE_SSE2)
|
||||
if (VP8GetCPUInfo(kSSE2)) {
|
||||
VP8EncDspCostInitSSE2();
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_NEON)
|
||||
#if defined(WEBP_HAVE_NEON)
|
||||
if (VP8GetCPUInfo(kNEON)) {
|
||||
VP8EncDspCostInitNEON();
|
||||
}
|
||||
|
||||
45
thirdparty/libwebp/src/dsp/cpu.c
vendored
45
thirdparty/libwebp/src/dsp/cpu.c
vendored
@@ -55,12 +55,18 @@ static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
|
||||
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
|
||||
: "a"(info_type), "c"(0));
|
||||
}
|
||||
#elif (defined(_M_X64) || defined(_M_IX86)) && \
|
||||
defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729 // >= VS2008 SP1
|
||||
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
|
||||
|
||||
#if defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729 // >= VS2008 SP1
|
||||
#include <intrin.h>
|
||||
#define GetCPUInfo(info, type) __cpuidex(info, type, 0) // set ecx=0
|
||||
#elif defined(WEBP_MSC_SSE2)
|
||||
#define WEBP_HAVE_MSC_CPUID
|
||||
#elif _MSC_VER > 1310
|
||||
#include <intrin.h>
|
||||
#define GetCPUInfo __cpuid
|
||||
#define WEBP_HAVE_MSC_CPUID
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
// NaCl has no support for xgetbv or the raw opcode.
|
||||
@@ -94,7 +100,7 @@ static WEBP_INLINE uint64_t xgetbv(void) {
|
||||
#define xgetbv() 0U // no AVX for older x64 or unrecognized toolchains.
|
||||
#endif
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)
|
||||
#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_HAVE_MSC_CPUID)
|
||||
|
||||
// helper function for run-time detection of slow SSSE3 platforms
|
||||
static int CheckSlowModel(int info) {
|
||||
@@ -179,9 +185,34 @@ static int AndroidCPUInfo(CPUFeature feature) {
|
||||
return 0;
|
||||
}
|
||||
VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo;
|
||||
#elif defined(WEBP_USE_NEON)
|
||||
// define a dummy function to enable turning off NEON at runtime by setting
|
||||
// VP8DecGetCPUInfo = NULL
|
||||
#elif defined(EMSCRIPTEN) // also needs to be before generic NEON test
|
||||
// Use compile flags as an indicator of SIMD support instead of a runtime check.
|
||||
static int wasmCPUInfo(CPUFeature feature) {
|
||||
switch (feature) {
|
||||
#ifdef WEBP_HAVE_SSE2
|
||||
case kSSE2:
|
||||
return 1;
|
||||
#endif
|
||||
#ifdef WEBP_HAVE_SSE41
|
||||
case kSSE3:
|
||||
case kSlowSSSE3:
|
||||
case kSSE4_1:
|
||||
return 1;
|
||||
#endif
|
||||
#ifdef WEBP_HAVE_NEON
|
||||
case kNEON:
|
||||
return 1;
|
||||
#endif
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
VP8CPUInfo VP8GetCPUInfo = wasmCPUInfo;
|
||||
#elif defined(WEBP_HAVE_NEON)
|
||||
// In most cases this function doesn't check for NEON support (it's assumed by
|
||||
// the configuration), but enables turning off NEON at runtime, for testing
|
||||
// purposes, by setting VP8DecGetCPUInfo = NULL.
|
||||
static int armCPUInfo(CPUFeature feature) {
|
||||
if (feature != kNEON) return 0;
|
||||
#if defined(__linux__) && defined(WEBP_HAVE_NEON_RTCD)
|
||||
|
||||
6
thirdparty/libwebp/src/dsp/dec.c
vendored
6
thirdparty/libwebp/src/dsp/dec.c
vendored
@@ -807,10 +807,10 @@ WEBP_DSP_INIT_FUNC(VP8DspInit) {
|
||||
|
||||
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
#if defined(WEBP_HAVE_SSE2)
|
||||
if (VP8GetCPUInfo(kSSE2)) {
|
||||
VP8DspInitSSE2();
|
||||
#if defined(WEBP_USE_SSE41)
|
||||
#if defined(WEBP_HAVE_SSE41)
|
||||
if (VP8GetCPUInfo(kSSE4_1)) {
|
||||
VP8DspInitSSE41();
|
||||
}
|
||||
@@ -834,7 +834,7 @@ WEBP_DSP_INIT_FUNC(VP8DspInit) {
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
#if defined(WEBP_HAVE_NEON)
|
||||
if (WEBP_NEON_OMIT_C_CODE ||
|
||||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
|
||||
VP8DspInitNEON();
|
||||
|
||||
68
thirdparty/libwebp/src/dsp/dec_neon.c
vendored
68
thirdparty/libwebp/src/dsp/dec_neon.c
vendored
@@ -1283,12 +1283,12 @@ static void DC4_NEON(uint8_t* dst) { // DC
|
||||
const uint8x8_t A = vld1_u8(dst - BPS); // top row
|
||||
const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
|
||||
const uint16x4_t p1 = vpadd_u16(p0, p0);
|
||||
const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
|
||||
const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
|
||||
const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
|
||||
const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
|
||||
const uint16x8_t s0 = vaddq_u16(L0, L1);
|
||||
const uint16x8_t s1 = vaddq_u16(L2, L3);
|
||||
const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
|
||||
const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
|
||||
const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
|
||||
const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
|
||||
const uint16x8_t s0 = vaddl_u8(L0, L1);
|
||||
const uint16x8_t s1 = vaddl_u8(L2, L3);
|
||||
const uint16x8_t s01 = vaddq_u16(s0, s1);
|
||||
const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
|
||||
const uint8x8_t dc0 = vrshrn_n_u16(sum, 3); // (sum + 4) >> 3
|
||||
@@ -1429,8 +1429,7 @@ static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
|
||||
if (do_top) {
|
||||
const uint8x8_t A = vld1_u8(dst - BPS); // top row
|
||||
#if defined(__aarch64__)
|
||||
const uint16x8_t B = vmovl_u8(A);
|
||||
const uint16_t p2 = vaddvq_u16(B);
|
||||
const uint16_t p2 = vaddlv_u8(A);
|
||||
sum_top = vdupq_n_u16(p2);
|
||||
#else
|
||||
const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
|
||||
@@ -1441,18 +1440,18 @@ static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
|
||||
}
|
||||
|
||||
if (do_left) {
|
||||
const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
|
||||
const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
|
||||
const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
|
||||
const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
|
||||
const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
|
||||
const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
|
||||
const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
|
||||
const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
|
||||
const uint16x8_t s0 = vaddq_u16(L0, L1);
|
||||
const uint16x8_t s1 = vaddq_u16(L2, L3);
|
||||
const uint16x8_t s2 = vaddq_u16(L4, L5);
|
||||
const uint16x8_t s3 = vaddq_u16(L6, L7);
|
||||
const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
|
||||
const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
|
||||
const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
|
||||
const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
|
||||
const uint8x8_t L4 = vld1_u8(dst + 4 * BPS - 1);
|
||||
const uint8x8_t L5 = vld1_u8(dst + 5 * BPS - 1);
|
||||
const uint8x8_t L6 = vld1_u8(dst + 6 * BPS - 1);
|
||||
const uint8x8_t L7 = vld1_u8(dst + 7 * BPS - 1);
|
||||
const uint16x8_t s0 = vaddl_u8(L0, L1);
|
||||
const uint16x8_t s1 = vaddl_u8(L2, L3);
|
||||
const uint16x8_t s2 = vaddl_u8(L4, L5);
|
||||
const uint16x8_t s3 = vaddl_u8(L6, L7);
|
||||
const uint16x8_t s01 = vaddq_u16(s0, s1);
|
||||
const uint16x8_t s23 = vaddq_u16(s2, s3);
|
||||
sum_left = vaddq_u16(s01, s23);
|
||||
@@ -1512,29 +1511,34 @@ static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
|
||||
|
||||
if (do_top) {
|
||||
const uint8x16_t A = vld1q_u8(dst - BPS); // top row
|
||||
#if defined(__aarch64__)
|
||||
const uint16_t p3 = vaddlvq_u8(A);
|
||||
sum_top = vdupq_n_u16(p3);
|
||||
#else
|
||||
const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
|
||||
const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
|
||||
const uint16x4_t p2 = vpadd_u16(p1, p1);
|
||||
const uint16x4_t p3 = vpadd_u16(p2, p2);
|
||||
sum_top = vcombine_u16(p3, p3);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (do_left) {
|
||||
int i;
|
||||
sum_left = vdupq_n_u16(0);
|
||||
for (i = 0; i < 16; i += 8) {
|
||||
const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
|
||||
const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
|
||||
const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
|
||||
const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
|
||||
const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
|
||||
const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
|
||||
const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
|
||||
const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
|
||||
const uint16x8_t s0 = vaddq_u16(L0, L1);
|
||||
const uint16x8_t s1 = vaddq_u16(L2, L3);
|
||||
const uint16x8_t s2 = vaddq_u16(L4, L5);
|
||||
const uint16x8_t s3 = vaddq_u16(L6, L7);
|
||||
const uint8x8_t L0 = vld1_u8(dst + (i + 0) * BPS - 1);
|
||||
const uint8x8_t L1 = vld1_u8(dst + (i + 1) * BPS - 1);
|
||||
const uint8x8_t L2 = vld1_u8(dst + (i + 2) * BPS - 1);
|
||||
const uint8x8_t L3 = vld1_u8(dst + (i + 3) * BPS - 1);
|
||||
const uint8x8_t L4 = vld1_u8(dst + (i + 4) * BPS - 1);
|
||||
const uint8x8_t L5 = vld1_u8(dst + (i + 5) * BPS - 1);
|
||||
const uint8x8_t L6 = vld1_u8(dst + (i + 6) * BPS - 1);
|
||||
const uint8x8_t L7 = vld1_u8(dst + (i + 7) * BPS - 1);
|
||||
const uint16x8_t s0 = vaddl_u8(L0, L1);
|
||||
const uint16x8_t s1 = vaddl_u8(L2, L3);
|
||||
const uint16x8_t s2 = vaddl_u8(L4, L5);
|
||||
const uint16x8_t s3 = vaddl_u8(L6, L7);
|
||||
const uint16x8_t s01 = vaddq_u16(s0, s1);
|
||||
const uint16x8_t s23 = vaddq_u16(s2, s3);
|
||||
const uint16x8_t sum = vaddq_u16(s01, s23);
|
||||
|
||||
106
thirdparty/libwebp/src/dsp/dsp.h
vendored
106
thirdparty/libwebp/src/dsp/dsp.h
vendored
@@ -26,6 +26,23 @@ extern "C" {
|
||||
|
||||
#define BPS 32 // this is the common stride for enc/dec
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// WEBP_RESTRICT
|
||||
|
||||
// Declares a pointer with the restrict type qualifier if available.
|
||||
// This allows code to hint to the compiler that only this pointer references a
|
||||
// particular object or memory region within the scope of the block in which it
|
||||
// is declared. This may allow for improved optimizations due to the lack of
|
||||
// pointer aliasing. See also:
|
||||
// https://en.cppreference.com/w/c/language/restrict
|
||||
#if defined(__GNUC__)
|
||||
#define WEBP_RESTRICT __restrict__
|
||||
#elif defined(_MSC_VER)
|
||||
#define WEBP_RESTRICT __restrict
|
||||
#else
|
||||
#define WEBP_RESTRICT
|
||||
#endif
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// CPU detection
|
||||
|
||||
@@ -51,9 +68,7 @@ extern "C" {
|
||||
# define __has_builtin(x) 0
|
||||
#endif
|
||||
|
||||
// for now, none of the optimizations below are available in emscripten
|
||||
#if !defined(EMSCRIPTEN)
|
||||
|
||||
#if !defined(HAVE_CONFIG_H)
|
||||
#if defined(_MSC_VER) && _MSC_VER > 1310 && \
|
||||
(defined(_M_X64) || defined(_M_IX86))
|
||||
#define WEBP_MSC_SSE2 // Visual C++ SSE2 targets
|
||||
@@ -63,23 +78,37 @@ extern "C" {
|
||||
(defined(_M_X64) || defined(_M_IX86))
|
||||
#define WEBP_MSC_SSE41 // Visual C++ SSE4.1 targets
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
|
||||
// files without intrinsics, allowing the corresponding Init() to be called.
|
||||
// Files containing intrinsics will need to be built targeting the instruction
|
||||
// set so should succeed on one of the earlier tests.
|
||||
#if defined(__SSE2__) || defined(WEBP_MSC_SSE2) || defined(WEBP_HAVE_SSE2)
|
||||
#if (defined(__SSE2__) || defined(WEBP_MSC_SSE2)) && \
|
||||
(!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE2))
|
||||
#define WEBP_USE_SSE2
|
||||
#endif
|
||||
|
||||
#if defined(__SSE4_1__) || defined(WEBP_MSC_SSE41) || defined(WEBP_HAVE_SSE41)
|
||||
#if defined(WEBP_USE_SSE2) && !defined(WEBP_HAVE_SSE2)
|
||||
#define WEBP_HAVE_SSE2
|
||||
#endif
|
||||
|
||||
#if (defined(__SSE4_1__) || defined(WEBP_MSC_SSE41)) && \
|
||||
(!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE41))
|
||||
#define WEBP_USE_SSE41
|
||||
#endif
|
||||
|
||||
#if defined(WEBP_USE_SSE41) && !defined(WEBP_HAVE_SSE41)
|
||||
#define WEBP_HAVE_SSE41
|
||||
#endif
|
||||
|
||||
#undef WEBP_MSC_SSE41
|
||||
#undef WEBP_MSC_SSE2
|
||||
|
||||
// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
|
||||
// inline assembly would need to be modified for use with Native Client.
|
||||
#if (defined(__ARM_NEON__) || \
|
||||
defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \
|
||||
#if ((defined(__ARM_NEON__) || defined(__aarch64__)) && \
|
||||
(!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_NEON))) && \
|
||||
!defined(__native_client__)
|
||||
#define WEBP_USE_NEON
|
||||
#endif
|
||||
@@ -95,6 +124,10 @@ extern "C" {
|
||||
#define WEBP_USE_INTRINSICS
|
||||
#endif
|
||||
|
||||
#if defined(WEBP_USE_NEON) && !defined(WEBP_HAVE_NEON)
|
||||
#define WEBP_HAVE_NEON
|
||||
#endif
|
||||
|
||||
#if defined(__mips__) && !defined(__mips64) && \
|
||||
defined(__mips_isa_rev) && (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
|
||||
#define WEBP_USE_MIPS32
|
||||
@@ -110,13 +143,11 @@ extern "C" {
|
||||
#define WEBP_USE_MSA
|
||||
#endif
|
||||
|
||||
#endif /* EMSCRIPTEN */
|
||||
|
||||
#ifndef WEBP_DSP_OMIT_C_CODE
|
||||
#define WEBP_DSP_OMIT_C_CODE 1
|
||||
#endif
|
||||
|
||||
#if (defined(__aarch64__) || defined(__ARM_NEON__)) && WEBP_DSP_OMIT_C_CODE
|
||||
#if defined(WEBP_USE_NEON) && WEBP_DSP_OMIT_C_CODE
|
||||
#define WEBP_NEON_OMIT_C_CODE 1
|
||||
#else
|
||||
#define WEBP_NEON_OMIT_C_CODE 0
|
||||
@@ -193,6 +224,12 @@ extern "C" {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// If 'ptr' is NULL, returns NULL. Otherwise returns 'ptr + off'.
|
||||
// Prevents undefined behavior sanitizer nullptr-with-nonzero-offset warning.
|
||||
#if !defined(WEBP_OFFSET_PTR)
|
||||
#define WEBP_OFFSET_PTR(ptr, off) (((ptr) == NULL) ? NULL : ((ptr) + (off)))
|
||||
#endif
|
||||
|
||||
// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility)
|
||||
#if !defined(WEBP_SWAP_16BIT_CSP)
|
||||
#define WEBP_SWAP_16BIT_CSP 0
|
||||
@@ -572,26 +609,29 @@ extern void (*WebPApplyAlphaMultiply4444)(
|
||||
|
||||
// Dispatch the values from alpha[] plane to the ARGB destination 'dst'.
|
||||
// Returns true if alpha[] plane has non-trivial values different from 0xff.
|
||||
extern int (*WebPDispatchAlpha)(const uint8_t* alpha, int alpha_stride,
|
||||
int width, int height,
|
||||
uint8_t* dst, int dst_stride);
|
||||
extern int (*WebPDispatchAlpha)(const uint8_t* WEBP_RESTRICT alpha,
|
||||
int alpha_stride, int width, int height,
|
||||
uint8_t* WEBP_RESTRICT dst, int dst_stride);
|
||||
|
||||
// Transfer packed 8b alpha[] values to green channel in dst[], zero'ing the
|
||||
// A/R/B values. 'dst_stride' is the stride for dst[] in uint32_t units.
|
||||
extern void (*WebPDispatchAlphaToGreen)(const uint8_t* alpha, int alpha_stride,
|
||||
int width, int height,
|
||||
uint32_t* dst, int dst_stride);
|
||||
extern void (*WebPDispatchAlphaToGreen)(const uint8_t* WEBP_RESTRICT alpha,
|
||||
int alpha_stride, int width, int height,
|
||||
uint32_t* WEBP_RESTRICT dst,
|
||||
int dst_stride);
|
||||
|
||||
// Extract the alpha values from 32b values in argb[] and pack them into alpha[]
|
||||
// (this is the opposite of WebPDispatchAlpha).
|
||||
// Returns true if there's only trivial 0xff alpha values.
|
||||
extern int (*WebPExtractAlpha)(const uint8_t* argb, int argb_stride,
|
||||
int width, int height,
|
||||
uint8_t* alpha, int alpha_stride);
|
||||
extern int (*WebPExtractAlpha)(const uint8_t* WEBP_RESTRICT argb,
|
||||
int argb_stride, int width, int height,
|
||||
uint8_t* WEBP_RESTRICT alpha,
|
||||
int alpha_stride);
|
||||
|
||||
// Extract the green values from 32b values in argb[] and pack them into alpha[]
|
||||
// (this is the opposite of WebPDispatchAlphaToGreen).
|
||||
extern void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
|
||||
extern void (*WebPExtractGreen)(const uint32_t* WEBP_RESTRICT argb,
|
||||
uint8_t* WEBP_RESTRICT alpha, int size);
|
||||
|
||||
// Pre-Multiply operation transforms x into x * A / 255 (where x=Y,R,G or B).
|
||||
// Un-Multiply operation transforms x into x * 255 / A.
|
||||
@@ -604,34 +644,42 @@ void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
|
||||
int inverse);
|
||||
|
||||
// Same for a row of single values, with side alpha values.
|
||||
extern void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
|
||||
extern void (*WebPMultRow)(uint8_t* WEBP_RESTRICT const ptr,
|
||||
const uint8_t* WEBP_RESTRICT const alpha,
|
||||
int width, int inverse);
|
||||
|
||||
// Same a WebPMultRow(), but for several 'num_rows' rows.
|
||||
void WebPMultRows(uint8_t* ptr, int stride,
|
||||
const uint8_t* alpha, int alpha_stride,
|
||||
void WebPMultRows(uint8_t* WEBP_RESTRICT ptr, int stride,
|
||||
const uint8_t* WEBP_RESTRICT alpha, int alpha_stride,
|
||||
int width, int num_rows, int inverse);
|
||||
|
||||
// Plain-C versions, used as fallback by some implementations.
|
||||
void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
|
||||
void WebPMultRow_C(uint8_t* WEBP_RESTRICT const ptr,
|
||||
const uint8_t* WEBP_RESTRICT const alpha,
|
||||
int width, int inverse);
|
||||
void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse);
|
||||
|
||||
#ifdef WORDS_BIGENDIAN
|
||||
// ARGB packing function: a/r/g/b input is rgba or bgra order.
|
||||
extern void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r,
|
||||
const uint8_t* g, const uint8_t* b, int len,
|
||||
uint32_t* out);
|
||||
extern void (*WebPPackARGB)(const uint8_t* WEBP_RESTRICT a,
|
||||
const uint8_t* WEBP_RESTRICT r,
|
||||
const uint8_t* WEBP_RESTRICT g,
|
||||
const uint8_t* WEBP_RESTRICT b,
|
||||
int len, uint32_t* WEBP_RESTRICT out);
|
||||
#endif
|
||||
|
||||
// RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
|
||||
extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
|
||||
int len, int step, uint32_t* out);
|
||||
extern void (*WebPPackRGB)(const uint8_t* WEBP_RESTRICT r,
|
||||
const uint8_t* WEBP_RESTRICT g,
|
||||
const uint8_t* WEBP_RESTRICT b,
|
||||
int len, int step, uint32_t* WEBP_RESTRICT out);
|
||||
|
||||
// This function returns true if src[i] contains a value different from 0xff.
|
||||
extern int (*WebPHasAlpha8b)(const uint8_t* src, int length);
|
||||
// This function returns true if src[4*i] contains a value different from 0xff.
|
||||
extern int (*WebPHasAlpha32b)(const uint8_t* src, int length);
|
||||
// replaces transparent values in src[] by 'color'.
|
||||
extern void (*WebPAlphaReplace)(uint32_t* src, int length, uint32_t color);
|
||||
|
||||
// To be called first before using the above.
|
||||
void WebPInitAlphaProcessing(void);
|
||||
|
||||
6
thirdparty/libwebp/src/dsp/enc.c
vendored
6
thirdparty/libwebp/src/dsp/enc.c
vendored
@@ -773,10 +773,10 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
|
||||
|
||||
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
#if defined(WEBP_HAVE_SSE2)
|
||||
if (VP8GetCPUInfo(kSSE2)) {
|
||||
VP8EncDspInitSSE2();
|
||||
#if defined(WEBP_USE_SSE41)
|
||||
#if defined(WEBP_HAVE_SSE41)
|
||||
if (VP8GetCPUInfo(kSSE4_1)) {
|
||||
VP8EncDspInitSSE41();
|
||||
}
|
||||
@@ -800,7 +800,7 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
#if defined(WEBP_HAVE_NEON)
|
||||
if (WEBP_NEON_OMIT_C_CODE ||
|
||||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
|
||||
VP8EncDspInitNEON();
|
||||
|
||||
4
thirdparty/libwebp/src/dsp/filters.c
vendored
4
thirdparty/libwebp/src/dsp/filters.c
vendored
@@ -254,7 +254,7 @@ WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
|
||||
#endif
|
||||
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
#if defined(WEBP_HAVE_SSE2)
|
||||
if (VP8GetCPUInfo(kSSE2)) {
|
||||
VP8FiltersInitSSE2();
|
||||
}
|
||||
@@ -271,7 +271,7 @@ WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
#if defined(WEBP_HAVE_NEON)
|
||||
if (WEBP_NEON_OMIT_C_CODE ||
|
||||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
|
||||
VP8FiltersInitNEON();
|
||||
|
||||
5
thirdparty/libwebp/src/dsp/filters_sse2.c
vendored
5
thirdparty/libwebp/src/dsp/filters_sse2.c
vendored
@@ -320,7 +320,12 @@ extern void VP8FiltersInitSSE2(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitSSE2(void) {
|
||||
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_SSE2;
|
||||
#if defined(CHROMIUM)
|
||||
// TODO(crbug.com/654974)
|
||||
(void)VerticalUnfilter_SSE2;
|
||||
#else
|
||||
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_SSE2;
|
||||
#endif
|
||||
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_SSE2;
|
||||
|
||||
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_SSE2;
|
||||
|
||||
66
thirdparty/libwebp/src/dsp/lossless.c
vendored
66
thirdparty/libwebp/src/dsp/lossless.c
vendored
@@ -107,62 +107,62 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
|
||||
//------------------------------------------------------------------------------
|
||||
// Predictors
|
||||
|
||||
static uint32_t Predictor0_C(uint32_t left, const uint32_t* const top) {
|
||||
uint32_t VP8LPredictor0_C(uint32_t left, const uint32_t* const top) {
|
||||
(void)top;
|
||||
(void)left;
|
||||
return ARGB_BLACK;
|
||||
}
|
||||
static uint32_t Predictor1_C(uint32_t left, const uint32_t* const top) {
|
||||
uint32_t VP8LPredictor1_C(uint32_t left, const uint32_t* const top) {
|
||||
(void)top;
|
||||
return left;
|
||||
}
|
||||
static uint32_t Predictor2_C(uint32_t left, const uint32_t* const top) {
|
||||
uint32_t VP8LPredictor2_C(uint32_t left, const uint32_t* const top) {
|
||||
(void)left;
|
||||
return top[0];
|
||||
}
|
||||
static uint32_t Predictor3_C(uint32_t left, const uint32_t* const top) {
|
||||
uint32_t VP8LPredictor3_C(uint32_t left, const uint32_t* const top) {
|
||||
(void)left;
|
||||
return top[1];
|
||||
}
|
||||
static uint32_t Predictor4_C(uint32_t left, const uint32_t* const top) {
|
||||
uint32_t VP8LPredictor4_C(uint32_t left, const uint32_t* const top) {
|
||||
(void)left;
|
||||
return top[-1];
|
||||
}
|
||||
static uint32_t Predictor5_C(uint32_t left, const uint32_t* const top) {
|
||||
uint32_t VP8LPredictor5_C(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Average3(left, top[0], top[1]);
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor6_C(uint32_t left, const uint32_t* const top) {
|
||||
uint32_t VP8LPredictor6_C(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Average2(left, top[-1]);
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor7_C(uint32_t left, const uint32_t* const top) {
|
||||
uint32_t VP8LPredictor7_C(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Average2(left, top[0]);
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor8_C(uint32_t left, const uint32_t* const top) {
|
||||
uint32_t VP8LPredictor8_C(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Average2(top[-1], top[0]);
|
||||
(void)left;
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor9_C(uint32_t left, const uint32_t* const top) {
|
||||
uint32_t VP8LPredictor9_C(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Average2(top[0], top[1]);
|
||||
(void)left;
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor10_C(uint32_t left, const uint32_t* const top) {
|
||||
uint32_t VP8LPredictor10_C(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor11_C(uint32_t left, const uint32_t* const top) {
|
||||
uint32_t VP8LPredictor11_C(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Select(top[0], left, top[-1]);
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor12_C(uint32_t left, const uint32_t* const top) {
|
||||
uint32_t VP8LPredictor12_C(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor13_C(uint32_t left, const uint32_t* const top) {
|
||||
uint32_t VP8LPredictor13_C(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
|
||||
return pred;
|
||||
}
|
||||
@@ -182,18 +182,18 @@ static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper,
|
||||
out[i] = left = VP8LAddPixels(in[i], left);
|
||||
}
|
||||
}
|
||||
GENERATE_PREDICTOR_ADD(Predictor2_C, PredictorAdd2_C)
|
||||
GENERATE_PREDICTOR_ADD(Predictor3_C, PredictorAdd3_C)
|
||||
GENERATE_PREDICTOR_ADD(Predictor4_C, PredictorAdd4_C)
|
||||
GENERATE_PREDICTOR_ADD(Predictor5_C, PredictorAdd5_C)
|
||||
GENERATE_PREDICTOR_ADD(Predictor6_C, PredictorAdd6_C)
|
||||
GENERATE_PREDICTOR_ADD(Predictor7_C, PredictorAdd7_C)
|
||||
GENERATE_PREDICTOR_ADD(Predictor8_C, PredictorAdd8_C)
|
||||
GENERATE_PREDICTOR_ADD(Predictor9_C, PredictorAdd9_C)
|
||||
GENERATE_PREDICTOR_ADD(Predictor10_C, PredictorAdd10_C)
|
||||
GENERATE_PREDICTOR_ADD(Predictor11_C, PredictorAdd11_C)
|
||||
GENERATE_PREDICTOR_ADD(Predictor12_C, PredictorAdd12_C)
|
||||
GENERATE_PREDICTOR_ADD(Predictor13_C, PredictorAdd13_C)
|
||||
GENERATE_PREDICTOR_ADD(VP8LPredictor2_C, PredictorAdd2_C)
|
||||
GENERATE_PREDICTOR_ADD(VP8LPredictor3_C, PredictorAdd3_C)
|
||||
GENERATE_PREDICTOR_ADD(VP8LPredictor4_C, PredictorAdd4_C)
|
||||
GENERATE_PREDICTOR_ADD(VP8LPredictor5_C, PredictorAdd5_C)
|
||||
GENERATE_PREDICTOR_ADD(VP8LPredictor6_C, PredictorAdd6_C)
|
||||
GENERATE_PREDICTOR_ADD(VP8LPredictor7_C, PredictorAdd7_C)
|
||||
GENERATE_PREDICTOR_ADD(VP8LPredictor8_C, PredictorAdd8_C)
|
||||
GENERATE_PREDICTOR_ADD(VP8LPredictor9_C, PredictorAdd9_C)
|
||||
GENERATE_PREDICTOR_ADD(VP8LPredictor10_C, PredictorAdd10_C)
|
||||
GENERATE_PREDICTOR_ADD(VP8LPredictor11_C, PredictorAdd11_C)
|
||||
GENERATE_PREDICTOR_ADD(VP8LPredictor12_C, PredictorAdd12_C)
|
||||
GENERATE_PREDICTOR_ADD(VP8LPredictor13_C, PredictorAdd13_C)
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
@@ -562,7 +562,6 @@ VP8LPredictorFunc VP8LPredictors[16];
|
||||
|
||||
// exposed plain-C implementations
|
||||
VP8LPredictorAddSubFunc VP8LPredictorsAdd_C[16];
|
||||
VP8LPredictorFunc VP8LPredictors_C[16];
|
||||
|
||||
VP8LTransformColorInverseFunc VP8LTransformColorInverse;
|
||||
|
||||
@@ -576,6 +575,7 @@ VP8LMapARGBFunc VP8LMapColor32b;
|
||||
VP8LMapAlphaFunc VP8LMapColor8b;
|
||||
|
||||
extern void VP8LDspInitSSE2(void);
|
||||
extern void VP8LDspInitSSE41(void);
|
||||
extern void VP8LDspInitNEON(void);
|
||||
extern void VP8LDspInitMIPSdspR2(void);
|
||||
extern void VP8LDspInitMSA(void);
|
||||
@@ -600,8 +600,7 @@ extern void VP8LDspInitMSA(void);
|
||||
} while (0);
|
||||
|
||||
WEBP_DSP_INIT_FUNC(VP8LDspInit) {
|
||||
COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors)
|
||||
COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors_C)
|
||||
COPY_PREDICTOR_ARRAY(VP8LPredictor, VP8LPredictors)
|
||||
COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd)
|
||||
COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd_C)
|
||||
|
||||
@@ -623,9 +622,14 @@ WEBP_DSP_INIT_FUNC(VP8LDspInit) {
|
||||
|
||||
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
#if defined(WEBP_HAVE_SSE2)
|
||||
if (VP8GetCPUInfo(kSSE2)) {
|
||||
VP8LDspInitSSE2();
|
||||
#if defined(WEBP_HAVE_SSE41)
|
||||
if (VP8GetCPUInfo(kSSE4_1)) {
|
||||
VP8LDspInitSSE41();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_MIPS_DSP_R2)
|
||||
@@ -640,7 +644,7 @@ WEBP_DSP_INIT_FUNC(VP8LDspInit) {
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
#if defined(WEBP_HAVE_NEON)
|
||||
if (WEBP_NEON_OMIT_C_CODE ||
|
||||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
|
||||
VP8LDspInitNEON();
|
||||
|
||||
17
thirdparty/libwebp/src/dsp/lossless.h
vendored
17
thirdparty/libwebp/src/dsp/lossless.h
vendored
@@ -30,7 +30,22 @@ extern "C" {
|
||||
|
||||
typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top);
|
||||
extern VP8LPredictorFunc VP8LPredictors[16];
|
||||
extern VP8LPredictorFunc VP8LPredictors_C[16];
|
||||
|
||||
uint32_t VP8LPredictor0_C(uint32_t left, const uint32_t* const top);
|
||||
uint32_t VP8LPredictor1_C(uint32_t left, const uint32_t* const top);
|
||||
uint32_t VP8LPredictor2_C(uint32_t left, const uint32_t* const top);
|
||||
uint32_t VP8LPredictor3_C(uint32_t left, const uint32_t* const top);
|
||||
uint32_t VP8LPredictor4_C(uint32_t left, const uint32_t* const top);
|
||||
uint32_t VP8LPredictor5_C(uint32_t left, const uint32_t* const top);
|
||||
uint32_t VP8LPredictor6_C(uint32_t left, const uint32_t* const top);
|
||||
uint32_t VP8LPredictor7_C(uint32_t left, const uint32_t* const top);
|
||||
uint32_t VP8LPredictor8_C(uint32_t left, const uint32_t* const top);
|
||||
uint32_t VP8LPredictor9_C(uint32_t left, const uint32_t* const top);
|
||||
uint32_t VP8LPredictor10_C(uint32_t left, const uint32_t* const top);
|
||||
uint32_t VP8LPredictor11_C(uint32_t left, const uint32_t* const top);
|
||||
uint32_t VP8LPredictor12_C(uint32_t left, const uint32_t* const top);
|
||||
uint32_t VP8LPredictor13_C(uint32_t left, const uint32_t* const top);
|
||||
|
||||
// These Add/Sub function expects upper[-1] and out[-1] to be readable.
|
||||
typedef void (*VP8LPredictorAddSubFunc)(const uint32_t* in,
|
||||
const uint32_t* upper, int num_pixels,
|
||||
|
||||
13
thirdparty/libwebp/src/dsp/lossless_common.h
vendored
13
thirdparty/libwebp/src/dsp/lossless_common.h
vendored
@@ -184,19 +184,6 @@ static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper, \
|
||||
} \
|
||||
}
|
||||
|
||||
// It subtracts the prediction from the input pixel and stores the residual
|
||||
// in the output pixel.
|
||||
#define GENERATE_PREDICTOR_SUB(PREDICTOR, PREDICTOR_SUB) \
|
||||
static void PREDICTOR_SUB(const uint32_t* in, const uint32_t* upper, \
|
||||
int num_pixels, uint32_t* out) { \
|
||||
int x; \
|
||||
assert(upper != NULL); \
|
||||
for (x = 0; x < num_pixels; ++x) { \
|
||||
const uint32_t pred = (PREDICTOR)(in[x - 1], upper + x); \
|
||||
out[x] = VP8LSubPixels(in[x], pred); \
|
||||
} \
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
198
thirdparty/libwebp/src/dsp/lossless_enc.c
vendored
198
thirdparty/libwebp/src/dsp/lossless_enc.c
vendored
@@ -329,6 +329,15 @@ const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
|
||||
static float FastSLog2Slow_C(uint32_t v) {
|
||||
assert(v >= LOG_LOOKUP_IDX_MAX);
|
||||
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
|
||||
#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
|
||||
// use clz if available
|
||||
const int log_cnt = BitsLog2Floor(v) - 7;
|
||||
const uint32_t y = 1 << log_cnt;
|
||||
int correction = 0;
|
||||
const float v_f = (float)v;
|
||||
const uint32_t orig_v = v;
|
||||
v >>= log_cnt;
|
||||
#else
|
||||
int log_cnt = 0;
|
||||
uint32_t y = 1;
|
||||
int correction = 0;
|
||||
@@ -339,6 +348,7 @@ static float FastSLog2Slow_C(uint32_t v) {
|
||||
v = v >> 1;
|
||||
y = y << 1;
|
||||
} while (v >= LOG_LOOKUP_IDX_MAX);
|
||||
#endif
|
||||
// vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
|
||||
// Xf = floor(Xf) * (1 + (v % y) / v)
|
||||
// log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
|
||||
@@ -355,6 +365,14 @@ static float FastSLog2Slow_C(uint32_t v) {
|
||||
static float FastLog2Slow_C(uint32_t v) {
|
||||
assert(v >= LOG_LOOKUP_IDX_MAX);
|
||||
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
|
||||
#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ)
|
||||
// use clz if available
|
||||
const int log_cnt = BitsLog2Floor(v) - 7;
|
||||
const uint32_t y = 1 << log_cnt;
|
||||
const uint32_t orig_v = v;
|
||||
double log_2;
|
||||
v >>= log_cnt;
|
||||
#else
|
||||
int log_cnt = 0;
|
||||
uint32_t y = 1;
|
||||
const uint32_t orig_v = v;
|
||||
@@ -364,6 +382,7 @@ static float FastLog2Slow_C(uint32_t v) {
|
||||
v = v >> 1;
|
||||
y = y << 1;
|
||||
} while (v >= LOG_LOOKUP_IDX_MAX);
|
||||
#endif
|
||||
log_2 = kLog2Table[v] + log_cnt;
|
||||
if (orig_v >= APPROX_LOG_MAX) {
|
||||
// Since the division is still expensive, add this correction factor only
|
||||
@@ -702,140 +721,6 @@ void VP8LHistogramAdd(const VP8LHistogram* const a,
|
||||
//------------------------------------------------------------------------------
|
||||
// Image transforms.
|
||||
|
||||
static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
|
||||
return (((a0 ^ a1) & 0xfefefefeu) >> 1) + (a0 & a1);
|
||||
}
|
||||
|
||||
static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
|
||||
return Average2(Average2(a0, a2), a1);
|
||||
}
|
||||
|
||||
static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
|
||||
uint32_t a2, uint32_t a3) {
|
||||
return Average2(Average2(a0, a1), Average2(a2, a3));
|
||||
}
|
||||
|
||||
static WEBP_INLINE uint32_t Clip255(uint32_t a) {
|
||||
if (a < 256) {
|
||||
return a;
|
||||
}
|
||||
// return 0, when a is a negative integer.
|
||||
// return 255, when a is positive.
|
||||
return ~a >> 24;
|
||||
}
|
||||
|
||||
static WEBP_INLINE int AddSubtractComponentFull(int a, int b, int c) {
|
||||
return Clip255(a + b - c);
|
||||
}
|
||||
|
||||
static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
|
||||
uint32_t c2) {
|
||||
const int a = AddSubtractComponentFull(c0 >> 24, c1 >> 24, c2 >> 24);
|
||||
const int r = AddSubtractComponentFull((c0 >> 16) & 0xff,
|
||||
(c1 >> 16) & 0xff,
|
||||
(c2 >> 16) & 0xff);
|
||||
const int g = AddSubtractComponentFull((c0 >> 8) & 0xff,
|
||||
(c1 >> 8) & 0xff,
|
||||
(c2 >> 8) & 0xff);
|
||||
const int b = AddSubtractComponentFull(c0 & 0xff, c1 & 0xff, c2 & 0xff);
|
||||
return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
|
||||
}
|
||||
|
||||
static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) {
|
||||
return Clip255(a + (a - b) / 2);
|
||||
}
|
||||
|
||||
static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
|
||||
uint32_t c2) {
|
||||
const uint32_t ave = Average2(c0, c1);
|
||||
const int a = AddSubtractComponentHalf(ave >> 24, c2 >> 24);
|
||||
const int r = AddSubtractComponentHalf((ave >> 16) & 0xff, (c2 >> 16) & 0xff);
|
||||
const int g = AddSubtractComponentHalf((ave >> 8) & 0xff, (c2 >> 8) & 0xff);
|
||||
const int b = AddSubtractComponentHalf((ave >> 0) & 0xff, (c2 >> 0) & 0xff);
|
||||
return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
|
||||
}
|
||||
|
||||
// gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined.
|
||||
#if defined(__arm__) && \
|
||||
(LOCAL_GCC_VERSION == 0x409 || LOCAL_GCC_VERSION == 0x408)
|
||||
# define LOCAL_INLINE __attribute__ ((noinline))
|
||||
#else
|
||||
# define LOCAL_INLINE WEBP_INLINE
|
||||
#endif
|
||||
|
||||
static LOCAL_INLINE int Sub3(int a, int b, int c) {
|
||||
const int pb = b - c;
|
||||
const int pa = a - c;
|
||||
return abs(pb) - abs(pa);
|
||||
}
|
||||
|
||||
#undef LOCAL_INLINE
|
||||
|
||||
static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
|
||||
const int pa_minus_pb =
|
||||
Sub3((a >> 24) , (b >> 24) , (c >> 24) ) +
|
||||
Sub3((a >> 16) & 0xff, (b >> 16) & 0xff, (c >> 16) & 0xff) +
|
||||
Sub3((a >> 8) & 0xff, (b >> 8) & 0xff, (c >> 8) & 0xff) +
|
||||
Sub3((a ) & 0xff, (b ) & 0xff, (c ) & 0xff);
|
||||
return (pa_minus_pb <= 0) ? a : b;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Predictors
|
||||
|
||||
static uint32_t Predictor2(uint32_t left, const uint32_t* const top) {
|
||||
(void)left;
|
||||
return top[0];
|
||||
}
|
||||
static uint32_t Predictor3(uint32_t left, const uint32_t* const top) {
|
||||
(void)left;
|
||||
return top[1];
|
||||
}
|
||||
static uint32_t Predictor4(uint32_t left, const uint32_t* const top) {
|
||||
(void)left;
|
||||
return top[-1];
|
||||
}
|
||||
static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Average3(left, top[0], top[1]);
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Average2(left, top[-1]);
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Average2(left, top[0]);
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Average2(top[-1], top[0]);
|
||||
(void)left;
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Average2(top[0], top[1]);
|
||||
(void)left;
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = Select(top[0], left, top[-1]);
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
|
||||
return pred;
|
||||
}
|
||||
static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
|
||||
const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
|
||||
return pred;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static void PredictorSub0_C(const uint32_t* in, const uint32_t* upper,
|
||||
int num_pixels, uint32_t* out) {
|
||||
int i;
|
||||
@@ -850,18 +735,33 @@ static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper,
|
||||
(void)upper;
|
||||
}
|
||||
|
||||
GENERATE_PREDICTOR_SUB(Predictor2, PredictorSub2_C)
|
||||
GENERATE_PREDICTOR_SUB(Predictor3, PredictorSub3_C)
|
||||
GENERATE_PREDICTOR_SUB(Predictor4, PredictorSub4_C)
|
||||
GENERATE_PREDICTOR_SUB(Predictor5, PredictorSub5_C)
|
||||
GENERATE_PREDICTOR_SUB(Predictor6, PredictorSub6_C)
|
||||
GENERATE_PREDICTOR_SUB(Predictor7, PredictorSub7_C)
|
||||
GENERATE_PREDICTOR_SUB(Predictor8, PredictorSub8_C)
|
||||
GENERATE_PREDICTOR_SUB(Predictor9, PredictorSub9_C)
|
||||
GENERATE_PREDICTOR_SUB(Predictor10, PredictorSub10_C)
|
||||
GENERATE_PREDICTOR_SUB(Predictor11, PredictorSub11_C)
|
||||
GENERATE_PREDICTOR_SUB(Predictor12, PredictorSub12_C)
|
||||
GENERATE_PREDICTOR_SUB(Predictor13, PredictorSub13_C)
|
||||
// It subtracts the prediction from the input pixel and stores the residual
|
||||
// in the output pixel.
|
||||
#define GENERATE_PREDICTOR_SUB(PREDICTOR_I) \
|
||||
static void PredictorSub##PREDICTOR_I##_C(const uint32_t* in, \
|
||||
const uint32_t* upper, \
|
||||
int num_pixels, uint32_t* out) { \
|
||||
int x; \
|
||||
assert(upper != NULL); \
|
||||
for (x = 0; x < num_pixels; ++x) { \
|
||||
const uint32_t pred = \
|
||||
VP8LPredictor##PREDICTOR_I##_C(in[x - 1], upper + x); \
|
||||
out[x] = VP8LSubPixels(in[x], pred); \
|
||||
} \
|
||||
}
|
||||
|
||||
GENERATE_PREDICTOR_SUB(2)
|
||||
GENERATE_PREDICTOR_SUB(3)
|
||||
GENERATE_PREDICTOR_SUB(4)
|
||||
GENERATE_PREDICTOR_SUB(5)
|
||||
GENERATE_PREDICTOR_SUB(6)
|
||||
GENERATE_PREDICTOR_SUB(7)
|
||||
GENERATE_PREDICTOR_SUB(8)
|
||||
GENERATE_PREDICTOR_SUB(9)
|
||||
GENERATE_PREDICTOR_SUB(10)
|
||||
GENERATE_PREDICTOR_SUB(11)
|
||||
GENERATE_PREDICTOR_SUB(12)
|
||||
GENERATE_PREDICTOR_SUB(13)
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
@@ -962,10 +862,10 @@ WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
|
||||
|
||||
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
#if defined(WEBP_HAVE_SSE2)
|
||||
if (VP8GetCPUInfo(kSSE2)) {
|
||||
VP8LEncDspInitSSE2();
|
||||
#if defined(WEBP_USE_SSE41)
|
||||
#if defined(WEBP_HAVE_SSE41)
|
||||
if (VP8GetCPUInfo(kSSE4_1)) {
|
||||
VP8LEncDspInitSSE41();
|
||||
}
|
||||
@@ -989,7 +889,7 @@ WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
#if defined(WEBP_HAVE_NEON)
|
||||
if (WEBP_NEON_OMIT_C_CODE ||
|
||||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
|
||||
VP8LEncDspInitNEON();
|
||||
|
||||
131
thirdparty/libwebp/src/dsp/lossless_enc_sse2.c
vendored
131
thirdparty/libwebp/src/dsp/lossless_enc_sse2.c
vendored
@@ -232,76 +232,55 @@ static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) {
|
||||
//------------------------------------------------------------------------------
|
||||
// Entropy
|
||||
|
||||
// Checks whether the X or Y contribution is worth computing and adding.
|
||||
// Used in loop unrolling.
|
||||
#define ANALYZE_X_OR_Y(x_or_y, j) \
|
||||
do { \
|
||||
if ((x_or_y)[i + (j)] != 0) retval -= VP8LFastSLog2((x_or_y)[i + (j)]); \
|
||||
} while (0)
|
||||
|
||||
// Checks whether the X + Y contribution is worth computing and adding.
|
||||
// Used in loop unrolling.
|
||||
#define ANALYZE_XY(j) \
|
||||
do { \
|
||||
if (tmp[j] != 0) { \
|
||||
retval -= VP8LFastSLog2(tmp[j]); \
|
||||
ANALYZE_X_OR_Y(X, j); \
|
||||
} \
|
||||
} while (0)
|
||||
// TODO(https://crbug.com/webp/499): this function produces different results
|
||||
// from the C code due to use of double/float resulting in output differences
|
||||
// when compared to -noasm.
|
||||
#if !(defined(WEBP_HAVE_SLOW_CLZ_CTZ) || defined(__i386__) || defined(_M_IX86))
|
||||
|
||||
static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
|
||||
int i;
|
||||
double retval = 0.;
|
||||
int sumX, sumXY;
|
||||
int32_t tmp[4];
|
||||
__m128i zero = _mm_setzero_si128();
|
||||
// Sums up X + Y, 4 ints at a time (and will merge it at the end for sumXY).
|
||||
__m128i sumXY_128 = zero;
|
||||
__m128i sumX_128 = zero;
|
||||
int sumX = 0, sumXY = 0;
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
|
||||
for (i = 0; i < 256; i += 4) {
|
||||
const __m128i x = _mm_loadu_si128((const __m128i*)(X + i));
|
||||
const __m128i y = _mm_loadu_si128((const __m128i*)(Y + i));
|
||||
|
||||
// Check if any X is non-zero: this actually provides a speedup as X is
|
||||
// usually sparse.
|
||||
if (_mm_movemask_epi8(_mm_cmpeq_epi32(x, zero)) != 0xFFFF) {
|
||||
const __m128i xy_128 = _mm_add_epi32(x, y);
|
||||
sumXY_128 = _mm_add_epi32(sumXY_128, xy_128);
|
||||
|
||||
sumX_128 = _mm_add_epi32(sumX_128, x);
|
||||
|
||||
// Analyze the different X + Y.
|
||||
_mm_storeu_si128((__m128i*)tmp, xy_128);
|
||||
|
||||
ANALYZE_XY(0);
|
||||
ANALYZE_XY(1);
|
||||
ANALYZE_XY(2);
|
||||
ANALYZE_XY(3);
|
||||
} else {
|
||||
// X is fully 0, so only deal with Y.
|
||||
sumXY_128 = _mm_add_epi32(sumXY_128, y);
|
||||
|
||||
ANALYZE_X_OR_Y(Y, 0);
|
||||
ANALYZE_X_OR_Y(Y, 1);
|
||||
ANALYZE_X_OR_Y(Y, 2);
|
||||
ANALYZE_X_OR_Y(Y, 3);
|
||||
for (i = 0; i < 256; i += 16) {
|
||||
const __m128i x0 = _mm_loadu_si128((const __m128i*)(X + i + 0));
|
||||
const __m128i y0 = _mm_loadu_si128((const __m128i*)(Y + i + 0));
|
||||
const __m128i x1 = _mm_loadu_si128((const __m128i*)(X + i + 4));
|
||||
const __m128i y1 = _mm_loadu_si128((const __m128i*)(Y + i + 4));
|
||||
const __m128i x2 = _mm_loadu_si128((const __m128i*)(X + i + 8));
|
||||
const __m128i y2 = _mm_loadu_si128((const __m128i*)(Y + i + 8));
|
||||
const __m128i x3 = _mm_loadu_si128((const __m128i*)(X + i + 12));
|
||||
const __m128i y3 = _mm_loadu_si128((const __m128i*)(Y + i + 12));
|
||||
const __m128i x4 = _mm_packs_epi16(_mm_packs_epi32(x0, x1),
|
||||
_mm_packs_epi32(x2, x3));
|
||||
const __m128i y4 = _mm_packs_epi16(_mm_packs_epi32(y0, y1),
|
||||
_mm_packs_epi32(y2, y3));
|
||||
const int32_t mx = _mm_movemask_epi8(_mm_cmpgt_epi8(x4, zero));
|
||||
int32_t my = _mm_movemask_epi8(_mm_cmpgt_epi8(y4, zero)) | mx;
|
||||
while (my) {
|
||||
const int32_t j = BitsCtz(my);
|
||||
int xy;
|
||||
if ((mx >> j) & 1) {
|
||||
const int x = X[i + j];
|
||||
sumXY += x;
|
||||
retval -= VP8LFastSLog2(x);
|
||||
}
|
||||
xy = X[i + j] + Y[i + j];
|
||||
sumX += xy;
|
||||
retval -= VP8LFastSLog2(xy);
|
||||
my &= my - 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Sum up sumX_128 to get sumX.
|
||||
_mm_storeu_si128((__m128i*)tmp, sumX_128);
|
||||
sumX = tmp[3] + tmp[2] + tmp[1] + tmp[0];
|
||||
|
||||
// Sum up sumXY_128 to get sumXY.
|
||||
_mm_storeu_si128((__m128i*)tmp, sumXY_128);
|
||||
sumXY = tmp[3] + tmp[2] + tmp[1] + tmp[0];
|
||||
|
||||
retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
|
||||
return (float)retval;
|
||||
}
|
||||
#undef ANALYZE_X_OR_Y
|
||||
#undef ANALYZE_XY
|
||||
|
||||
#else
|
||||
|
||||
#define DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC // won't be faster
|
||||
|
||||
#endif
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
@@ -460,20 +439,22 @@ static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper,
|
||||
(void)upper;
|
||||
}
|
||||
|
||||
#define GENERATE_PREDICTOR_1(X, IN) \
|
||||
static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
|
||||
int num_pixels, uint32_t* out) { \
|
||||
int i; \
|
||||
for (i = 0; i + 4 <= num_pixels; i += 4) { \
|
||||
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
|
||||
const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN)); \
|
||||
const __m128i res = _mm_sub_epi8(src, pred); \
|
||||
_mm_storeu_si128((__m128i*)&out[i], res); \
|
||||
} \
|
||||
if (i != num_pixels) { \
|
||||
VP8LPredictorsSub_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
|
||||
} \
|
||||
}
|
||||
#define GENERATE_PREDICTOR_1(X, IN) \
|
||||
static void PredictorSub##X##_SSE2(const uint32_t* const in, \
|
||||
const uint32_t* const upper, \
|
||||
int num_pixels, uint32_t* const out) { \
|
||||
int i; \
|
||||
for (i = 0; i + 4 <= num_pixels; i += 4) { \
|
||||
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
|
||||
const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN)); \
|
||||
const __m128i res = _mm_sub_epi8(src, pred); \
|
||||
_mm_storeu_si128((__m128i*)&out[i], res); \
|
||||
} \
|
||||
if (i != num_pixels) { \
|
||||
VP8LPredictorsSub_C[(X)](in + i, WEBP_OFFSET_PTR(upper, i), \
|
||||
num_pixels - i, out + i); \
|
||||
} \
|
||||
}
|
||||
|
||||
GENERATE_PREDICTOR_1(1, in[i - 1]) // Predictor1: L
|
||||
GENERATE_PREDICTOR_1(2, upper[i]) // Predictor2: T
|
||||
@@ -657,7 +638,9 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
|
||||
VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE2;
|
||||
VP8LAddVector = AddVector_SSE2;
|
||||
VP8LAddVectorEq = AddVectorEq_SSE2;
|
||||
#if !defined(DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC)
|
||||
VP8LCombinedShannonEntropy = CombinedShannonEntropy_SSE2;
|
||||
#endif
|
||||
VP8LVectorMismatch = VectorMismatch_SSE2;
|
||||
VP8LBundleColorMap = BundleColorMap_SSE2;
|
||||
|
||||
|
||||
119
thirdparty/libwebp/src/dsp/lossless_enc_sse41.c
vendored
119
thirdparty/libwebp/src/dsp/lossless_enc_sse41.c
vendored
@@ -44,46 +44,47 @@ static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
|
||||
//------------------------------------------------------------------------------
|
||||
// Color Transform
|
||||
|
||||
#define SPAN 8
|
||||
#define MK_CST_16(HI, LO) \
|
||||
_mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
|
||||
|
||||
static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
|
||||
int tile_width, int tile_height,
|
||||
int green_to_blue, int red_to_blue,
|
||||
int histo[]) {
|
||||
const __m128i mults_r = _mm_set1_epi16(CST_5b(red_to_blue));
|
||||
const __m128i mults_g = _mm_set1_epi16(CST_5b(green_to_blue));
|
||||
const __m128i mask_g = _mm_set1_epi16((short)0xff00); // green mask
|
||||
const __m128i mask_gb = _mm_set1_epi32(0xffff); // green/blue mask
|
||||
const __m128i mask_b = _mm_set1_epi16(0x00ff); // blue mask
|
||||
const __m128i shuffler_lo = _mm_setr_epi8(-1, 2, -1, 6, -1, 10, -1, 14, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1);
|
||||
const __m128i shuffler_hi = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
2, -1, 6, -1, 10, -1, 14);
|
||||
int y;
|
||||
for (y = 0; y < tile_height; ++y) {
|
||||
const uint32_t* const src = argb + y * stride;
|
||||
int i, x;
|
||||
for (x = 0; x + SPAN <= tile_width; x += SPAN) {
|
||||
uint16_t values[SPAN];
|
||||
const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
|
||||
const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
|
||||
const __m128i r0 = _mm_shuffle_epi8(in0, shuffler_lo);
|
||||
const __m128i r1 = _mm_shuffle_epi8(in1, shuffler_hi);
|
||||
const __m128i r = _mm_or_si128(r0, r1); // r 0
|
||||
const __m128i gb0 = _mm_and_si128(in0, mask_gb);
|
||||
const __m128i gb1 = _mm_and_si128(in1, mask_gb);
|
||||
const __m128i gb = _mm_packus_epi32(gb0, gb1); // g b
|
||||
const __m128i g = _mm_and_si128(gb, mask_g); // g 0
|
||||
const __m128i A = _mm_mulhi_epi16(r, mults_r); // x dbr
|
||||
const __m128i B = _mm_mulhi_epi16(g, mults_g); // x dbg
|
||||
const __m128i C = _mm_sub_epi8(gb, B); // x b'
|
||||
const __m128i D = _mm_sub_epi8(C, A); // x b''
|
||||
const __m128i E = _mm_and_si128(D, mask_b); // 0 b''
|
||||
_mm_storeu_si128((__m128i*)values, E);
|
||||
for (i = 0; i < SPAN; ++i) ++histo[values[i]];
|
||||
const __m128i mult =
|
||||
MK_CST_16(CST_5b(red_to_blue) + 256,CST_5b(green_to_blue));
|
||||
const __m128i perm =
|
||||
_mm_setr_epi8(-1, 1, -1, 2, -1, 5, -1, 6, -1, 9, -1, 10, -1, 13, -1, 14);
|
||||
if (tile_width >= 4) {
|
||||
int y;
|
||||
for (y = 0; y < tile_height; ++y) {
|
||||
const uint32_t* const src = argb + y * stride;
|
||||
const __m128i A1 = _mm_loadu_si128((const __m128i*)src);
|
||||
const __m128i B1 = _mm_shuffle_epi8(A1, perm);
|
||||
const __m128i C1 = _mm_mulhi_epi16(B1, mult);
|
||||
const __m128i D1 = _mm_sub_epi16(A1, C1);
|
||||
__m128i E = _mm_add_epi16(_mm_srli_epi32(D1, 16), D1);
|
||||
int x;
|
||||
for (x = 4; x + 4 <= tile_width; x += 4) {
|
||||
const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x));
|
||||
__m128i B2, C2, D2;
|
||||
++histo[_mm_extract_epi8(E, 0)];
|
||||
B2 = _mm_shuffle_epi8(A2, perm);
|
||||
++histo[_mm_extract_epi8(E, 4)];
|
||||
C2 = _mm_mulhi_epi16(B2, mult);
|
||||
++histo[_mm_extract_epi8(E, 8)];
|
||||
D2 = _mm_sub_epi16(A2, C2);
|
||||
++histo[_mm_extract_epi8(E, 12)];
|
||||
E = _mm_add_epi16(_mm_srli_epi32(D2, 16), D2);
|
||||
}
|
||||
++histo[_mm_extract_epi8(E, 0)];
|
||||
++histo[_mm_extract_epi8(E, 4)];
|
||||
++histo[_mm_extract_epi8(E, 8)];
|
||||
++histo[_mm_extract_epi8(E, 12)];
|
||||
}
|
||||
}
|
||||
{
|
||||
const int left_over = tile_width & (SPAN - 1);
|
||||
const int left_over = tile_width & 3;
|
||||
if (left_over > 0) {
|
||||
VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
|
||||
left_over, tile_height,
|
||||
@@ -95,33 +96,37 @@ static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
|
||||
static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
|
||||
int tile_width, int tile_height,
|
||||
int green_to_red, int histo[]) {
|
||||
const __m128i mults_g = _mm_set1_epi16(CST_5b(green_to_red));
|
||||
const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
|
||||
const __m128i mask = _mm_set1_epi16(0xff);
|
||||
|
||||
int y;
|
||||
for (y = 0; y < tile_height; ++y) {
|
||||
const uint32_t* const src = argb + y * stride;
|
||||
int i, x;
|
||||
for (x = 0; x + SPAN <= tile_width; x += SPAN) {
|
||||
uint16_t values[SPAN];
|
||||
const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
|
||||
const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
|
||||
const __m128i g0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0
|
||||
const __m128i g1 = _mm_and_si128(in1, mask_g);
|
||||
const __m128i g = _mm_packus_epi32(g0, g1); // g 0
|
||||
const __m128i A0 = _mm_srli_epi32(in0, 16); // 0 0 | x r
|
||||
const __m128i A1 = _mm_srli_epi32(in1, 16);
|
||||
const __m128i A = _mm_packus_epi32(A0, A1); // x r
|
||||
const __m128i B = _mm_mulhi_epi16(g, mults_g); // x dr
|
||||
const __m128i C = _mm_sub_epi8(A, B); // x r'
|
||||
const __m128i D = _mm_and_si128(C, mask); // 0 r'
|
||||
_mm_storeu_si128((__m128i*)values, D);
|
||||
for (i = 0; i < SPAN; ++i) ++histo[values[i]];
|
||||
const __m128i mult = MK_CST_16(0, CST_5b(green_to_red));
|
||||
const __m128i mask_g = _mm_set1_epi32(0x0000ff00);
|
||||
if (tile_width >= 4) {
|
||||
int y;
|
||||
for (y = 0; y < tile_height; ++y) {
|
||||
const uint32_t* const src = argb + y * stride;
|
||||
const __m128i A1 = _mm_loadu_si128((const __m128i*)src);
|
||||
const __m128i B1 = _mm_and_si128(A1, mask_g);
|
||||
const __m128i C1 = _mm_madd_epi16(B1, mult);
|
||||
__m128i D = _mm_sub_epi16(A1, C1);
|
||||
int x;
|
||||
for (x = 4; x + 4 <= tile_width; x += 4) {
|
||||
const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x));
|
||||
__m128i B2, C2;
|
||||
++histo[_mm_extract_epi8(D, 2)];
|
||||
B2 = _mm_and_si128(A2, mask_g);
|
||||
++histo[_mm_extract_epi8(D, 6)];
|
||||
C2 = _mm_madd_epi16(B2, mult);
|
||||
++histo[_mm_extract_epi8(D, 10)];
|
||||
++histo[_mm_extract_epi8(D, 14)];
|
||||
D = _mm_sub_epi16(A2, C2);
|
||||
}
|
||||
++histo[_mm_extract_epi8(D, 2)];
|
||||
++histo[_mm_extract_epi8(D, 6)];
|
||||
++histo[_mm_extract_epi8(D, 10)];
|
||||
++histo[_mm_extract_epi8(D, 14)];
|
||||
}
|
||||
}
|
||||
{
|
||||
const int left_over = tile_width & (SPAN - 1);
|
||||
const int left_over = tile_width & 3;
|
||||
if (left_over > 0) {
|
||||
VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
|
||||
left_over, tile_height, green_to_red,
|
||||
@@ -130,6 +135,8 @@ static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
|
||||
}
|
||||
}
|
||||
|
||||
#undef MK_CST_16
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Entry point
|
||||
|
||||
|
||||
1
thirdparty/libwebp/src/dsp/lossless_sse2.c
vendored
1
thirdparty/libwebp/src/dsp/lossless_sse2.c
vendored
@@ -18,7 +18,6 @@
|
||||
#include "src/dsp/common_sse2.h"
|
||||
#include "src/dsp/lossless.h"
|
||||
#include "src/dsp/lossless_common.h"
|
||||
#include <assert.h>
|
||||
#include <emmintrin.h>
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
132
thirdparty/libwebp/src/dsp/lossless_sse41.c
vendored
Normal file
132
thirdparty/libwebp/src/dsp/lossless_sse41.c
vendored
Normal file
@@ -0,0 +1,132 @@
|
||||
// Copyright 2021 Google Inc. All Rights Reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style license
|
||||
// that can be found in the COPYING file in the root of the source
|
||||
// tree. An additional intellectual property rights grant can be found
|
||||
// in the file PATENTS. All contributing project authors may
|
||||
// be found in the AUTHORS file in the root of the source tree.
|
||||
// -----------------------------------------------------------------------------
|
||||
//
|
||||
// SSE41 variant of methods for lossless decoder
|
||||
|
||||
#include "src/dsp/dsp.h"
|
||||
|
||||
#if defined(WEBP_USE_SSE41)
|
||||
|
||||
#include "src/dsp/common_sse41.h"
|
||||
#include "src/dsp/lossless.h"
|
||||
#include "src/dsp/lossless_common.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Color-space conversion functions
|
||||
|
||||
static void TransformColorInverse_SSE41(const VP8LMultipliers* const m,
|
||||
const uint32_t* const src,
|
||||
int num_pixels, uint32_t* dst) {
|
||||
// sign-extended multiplying constants, pre-shifted by 5.
|
||||
#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend
|
||||
const __m128i mults_rb = _mm_set1_epi32((uint32_t)CST(green_to_red_) << 16 |
|
||||
(CST(green_to_blue_) & 0xffff));
|
||||
const __m128i mults_b2 = _mm_set1_epi32(CST(red_to_blue_));
|
||||
#undef CST
|
||||
const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);
|
||||
const __m128i perm1 = _mm_setr_epi8(-1, 1, -1, 1, -1, 5, -1, 5,
|
||||
-1, 9, -1, 9, -1, 13, -1, 13);
|
||||
const __m128i perm2 = _mm_setr_epi8(-1, 2, -1, -1, -1, 6, -1, -1,
|
||||
-1, 10, -1, -1, -1, 14, -1, -1);
|
||||
int i;
|
||||
for (i = 0; i + 4 <= num_pixels; i += 4) {
|
||||
const __m128i A = _mm_loadu_si128((const __m128i*)(src + i));
|
||||
const __m128i B = _mm_shuffle_epi8(A, perm1); // argb -> g0g0
|
||||
const __m128i C = _mm_mulhi_epi16(B, mults_rb);
|
||||
const __m128i D = _mm_add_epi8(A, C);
|
||||
const __m128i E = _mm_shuffle_epi8(D, perm2);
|
||||
const __m128i F = _mm_mulhi_epi16(E, mults_b2);
|
||||
const __m128i G = _mm_add_epi8(D, F);
|
||||
const __m128i out = _mm_blendv_epi8(G, A, mask_ag);
|
||||
_mm_storeu_si128((__m128i*)&dst[i], out);
|
||||
}
|
||||
// Fall-back to C-version for left-overs.
|
||||
if (i != num_pixels) {
|
||||
VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
#define ARGB_TO_RGB_SSE41 do { \
|
||||
while (num_pixels >= 16) { \
|
||||
const __m128i in0 = _mm_loadu_si128(in + 0); \
|
||||
const __m128i in1 = _mm_loadu_si128(in + 1); \
|
||||
const __m128i in2 = _mm_loadu_si128(in + 2); \
|
||||
const __m128i in3 = _mm_loadu_si128(in + 3); \
|
||||
const __m128i a0 = _mm_shuffle_epi8(in0, perm0); \
|
||||
const __m128i a1 = _mm_shuffle_epi8(in1, perm1); \
|
||||
const __m128i a2 = _mm_shuffle_epi8(in2, perm2); \
|
||||
const __m128i a3 = _mm_shuffle_epi8(in3, perm3); \
|
||||
const __m128i b0 = _mm_blend_epi16(a0, a1, 0xc0); \
|
||||
const __m128i b1 = _mm_blend_epi16(a1, a2, 0xf0); \
|
||||
const __m128i b2 = _mm_blend_epi16(a2, a3, 0xfc); \
|
||||
_mm_storeu_si128(out + 0, b0); \
|
||||
_mm_storeu_si128(out + 1, b1); \
|
||||
_mm_storeu_si128(out + 2, b2); \
|
||||
in += 4; \
|
||||
out += 3; \
|
||||
num_pixels -= 16; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
static void ConvertBGRAToRGB_SSE41(const uint32_t* src, int num_pixels,
|
||||
uint8_t* dst) {
|
||||
const __m128i* in = (const __m128i*)src;
|
||||
__m128i* out = (__m128i*)dst;
|
||||
const __m128i perm0 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9,
|
||||
8, 14, 13, 12, -1, -1, -1, -1);
|
||||
const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39);
|
||||
const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e);
|
||||
const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93);
|
||||
|
||||
ARGB_TO_RGB_SSE41;
|
||||
|
||||
// left-overs
|
||||
if (num_pixels > 0) {
|
||||
VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
|
||||
}
|
||||
}
|
||||
|
||||
static void ConvertBGRAToBGR_SSE41(const uint32_t* src,
|
||||
int num_pixels, uint8_t* dst) {
|
||||
const __m128i* in = (const __m128i*)src;
|
||||
__m128i* out = (__m128i*)dst;
|
||||
const __m128i perm0 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10,
|
||||
12, 13, 14, -1, -1, -1, -1);
|
||||
const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39);
|
||||
const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e);
|
||||
const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93);
|
||||
|
||||
ARGB_TO_RGB_SSE41;
|
||||
|
||||
// left-overs
|
||||
if (num_pixels > 0) {
|
||||
VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
|
||||
}
|
||||
}
|
||||
|
||||
#undef ARGB_TO_RGB_SSE41
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Entry point
|
||||
|
||||
extern void VP8LDspInitSSE41(void);
|
||||
|
||||
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE41(void) {
|
||||
VP8LTransformColorInverse = TransformColorInverse_SSE41;
|
||||
VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE41;
|
||||
VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE41;
|
||||
}
|
||||
|
||||
#else // !WEBP_USE_SSE41
|
||||
|
||||
WEBP_DSP_INIT_STUB(VP8LDspInitSSE41)
|
||||
|
||||
#endif // WEBP_USE_SSE41
|
||||
11
thirdparty/libwebp/src/dsp/rescaler.c
vendored
11
thirdparty/libwebp/src/dsp/rescaler.c
vendored
@@ -38,8 +38,9 @@ void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
|
||||
int x_out = channel;
|
||||
// simple bilinear interpolation
|
||||
int accum = wrk->x_add;
|
||||
int left = src[x_in];
|
||||
int right = (wrk->src_width > 1) ? src[x_in + x_stride] : left;
|
||||
rescaler_t left = (rescaler_t)src[x_in];
|
||||
rescaler_t right =
|
||||
(wrk->src_width > 1) ? (rescaler_t)src[x_in + x_stride] : left;
|
||||
x_in += x_stride;
|
||||
while (1) {
|
||||
wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
|
||||
@@ -50,7 +51,7 @@ void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
|
||||
left = right;
|
||||
x_in += x_stride;
|
||||
assert(x_in < wrk->src_width * x_stride);
|
||||
right = src[x_in];
|
||||
right = (rescaler_t)src[x_in];
|
||||
accum += wrk->x_add;
|
||||
}
|
||||
}
|
||||
@@ -213,7 +214,7 @@ WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
|
||||
WebPRescalerImportRowShrink = WebPRescalerImportRowShrink_C;
|
||||
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
#if defined(WEBP_HAVE_SSE2)
|
||||
if (VP8GetCPUInfo(kSSE2)) {
|
||||
WebPRescalerDspInitSSE2();
|
||||
}
|
||||
@@ -235,7 +236,7 @@ WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
#if defined(WEBP_HAVE_NEON)
|
||||
if (WEBP_NEON_OMIT_C_CODE ||
|
||||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
|
||||
WebPRescalerDspInitNEON();
|
||||
|
||||
2
thirdparty/libwebp/src/dsp/ssim.c
vendored
2
thirdparty/libwebp/src/dsp/ssim.c
vendored
@@ -150,7 +150,7 @@ WEBP_DSP_INIT_FUNC(VP8SSIMDspInit) {
|
||||
#endif
|
||||
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
#if defined(WEBP_HAVE_SSE2)
|
||||
if (VP8GetCPUInfo(kSSE2)) {
|
||||
VP8SSIMDspInitSSE2();
|
||||
}
|
||||
|
||||
10
thirdparty/libwebp/src/dsp/upsampling.c
vendored
10
thirdparty/libwebp/src/dsp/upsampling.c
vendored
@@ -233,12 +233,12 @@ WEBP_DSP_INIT_FUNC(WebPInitYUV444Converters) {
|
||||
WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444_C;
|
||||
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
#if defined(WEBP_HAVE_SSE2)
|
||||
if (VP8GetCPUInfo(kSSE2)) {
|
||||
WebPInitYUV444ConvertersSSE2();
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_SSE41)
|
||||
#if defined(WEBP_HAVE_SSE41)
|
||||
if (VP8GetCPUInfo(kSSE4_1)) {
|
||||
WebPInitYUV444ConvertersSSE41();
|
||||
}
|
||||
@@ -278,12 +278,12 @@ WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
|
||||
|
||||
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
#if defined(WEBP_HAVE_SSE2)
|
||||
if (VP8GetCPUInfo(kSSE2)) {
|
||||
WebPInitUpsamplersSSE2();
|
||||
}
|
||||
#endif
|
||||
#if defined(WEBP_USE_SSE41)
|
||||
#if defined(WEBP_HAVE_SSE41)
|
||||
if (VP8GetCPUInfo(kSSE4_1)) {
|
||||
WebPInitUpsamplersSSE41();
|
||||
}
|
||||
@@ -300,7 +300,7 @@ WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
#if defined(WEBP_HAVE_NEON)
|
||||
if (WEBP_NEON_OMIT_C_CODE ||
|
||||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
|
||||
WebPInitUpsamplersNEON();
|
||||
|
||||
20
thirdparty/libwebp/src/dsp/yuv.c
vendored
20
thirdparty/libwebp/src/dsp/yuv.c
vendored
@@ -90,16 +90,16 @@ WEBP_DSP_INIT_FUNC(WebPInitSamplers) {
|
||||
|
||||
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
#if defined(WEBP_HAVE_SSE2)
|
||||
if (VP8GetCPUInfo(kSSE2)) {
|
||||
WebPInitSamplersSSE2();
|
||||
}
|
||||
#endif // WEBP_USE_SSE2
|
||||
#if defined(WEBP_USE_SSE41)
|
||||
#endif // WEBP_HAVE_SSE2
|
||||
#if defined(WEBP_HAVE_SSE41)
|
||||
if (VP8GetCPUInfo(kSSE4_1)) {
|
||||
WebPInitSamplersSSE41();
|
||||
}
|
||||
#endif // WEBP_USE_SSE41
|
||||
#endif // WEBP_HAVE_SSE41
|
||||
#if defined(WEBP_USE_MIPS32)
|
||||
if (VP8GetCPUInfo(kMIPS32)) {
|
||||
WebPInitSamplersMIPS32();
|
||||
@@ -276,26 +276,26 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
|
||||
#endif
|
||||
|
||||
if (VP8GetCPUInfo != NULL) {
|
||||
#if defined(WEBP_USE_SSE2)
|
||||
#if defined(WEBP_HAVE_SSE2)
|
||||
if (VP8GetCPUInfo(kSSE2)) {
|
||||
WebPInitConvertARGBToYUVSSE2();
|
||||
WebPInitSharpYUVSSE2();
|
||||
}
|
||||
#endif // WEBP_USE_SSE2
|
||||
#if defined(WEBP_USE_SSE41)
|
||||
#endif // WEBP_HAVE_SSE2
|
||||
#if defined(WEBP_HAVE_SSE41)
|
||||
if (VP8GetCPUInfo(kSSE4_1)) {
|
||||
WebPInitConvertARGBToYUVSSE41();
|
||||
}
|
||||
#endif // WEBP_USE_SSE41
|
||||
#endif // WEBP_HAVE_SSE41
|
||||
}
|
||||
|
||||
#if defined(WEBP_USE_NEON)
|
||||
#if defined(WEBP_HAVE_NEON)
|
||||
if (WEBP_NEON_OMIT_C_CODE ||
|
||||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
|
||||
WebPInitConvertARGBToYUVNEON();
|
||||
WebPInitSharpYUVNEON();
|
||||
}
|
||||
#endif // WEBP_USE_NEON
|
||||
#endif // WEBP_HAVE_NEON
|
||||
|
||||
assert(WebPConvertARGBToY != NULL);
|
||||
assert(WebPConvertARGBToUV != NULL);
|
||||
|
||||
Reference in New Issue
Block a user