diff --git a/thirdparty/README.md b/thirdparty/README.md
index 90f516090d4..4898294be50 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -662,7 +662,7 @@ Patches:
 ## meshoptimizer

 - Upstream: https://github.com/zeux/meshoptimizer
-- Version: 0.22 (4affad044571506a5724c9a6f15424f43e86f731, 2024)
+- Version: 0.23 (3e9d1ff3135794f519f3237515277c8d9a3fd3f2, 2025)
 - License: MIT

 Files extracted from upstream repository:
@@ -670,10 +670,6 @@ Files extracted from upstream repository:
 - All files in `src/`
 - `LICENSE.md`

-Patches:
-
-- `0001-simplifier-distance-only-error.patch` (GH-98529)
-
 ## mingw-std-threads

diff --git a/thirdparty/meshoptimizer/LICENSE.md b/thirdparty/meshoptimizer/LICENSE.md
index ef9f5919f27..a5c3b1ccca7 100644
--- a/thirdparty/meshoptimizer/LICENSE.md
+++ b/thirdparty/meshoptimizer/LICENSE.md
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2016-2024 Arseny Kapoulkine
+Copyright (c) 2016-2025 Arseny Kapoulkine

 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal
diff --git a/thirdparty/meshoptimizer/clusterizer.cpp b/thirdparty/meshoptimizer/clusterizer.cpp
index 738add5f2fe..26d2fb11c51 100644
--- a/thirdparty/meshoptimizer/clusterizer.cpp
+++ b/thirdparty/meshoptimizer/clusterizer.cpp
@@ -13,12 +13,16 @@
 namespace meshopt
 {

-// This must be <= 255 since index 0xff is used internally to indice a vertex that doesn't belong to a meshlet
-const size_t kMeshletMaxVertices = 255;
+// This must be <= 256 since meshlet indices are stored as bytes
+const size_t kMeshletMaxVertices = 256;

 // A reasonable limit is around 2*max_vertices or less
 const size_t kMeshletMaxTriangles = 512;

+// We keep a limited number of seed triangles and add a few triangles per finished meshlet
+const size_t kMeshletMaxSeeds = 256;
+const size_t kMeshletAddSeeds = 4;
+
 struct TriangleAdjacency2
 {
 	unsigned int* counts;
@@ -70,72 +74,152 @@ static void buildTriangleAdjacency(TriangleAdjacency2& adjacency, const unsigned
 	for (size_t i = 0; i < vertex_count; ++i)
 	{
 		assert(adjacency.offsets[i] >= adjacency.counts[i]);
-
 		adjacency.offsets[i] -= adjacency.counts[i];
 	}
 }

-static void computeBoundingSphere(float result[4], const float points[][3], size_t count)
+static void buildTriangleAdjacencySparse(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
+{
+	size_t face_count = index_count / 3;
+
+	// sparse mode can build adjacency more quickly by ignoring unused vertices, using a bit to mark visited vertices
+	const unsigned int sparse_seen = 1u << 31;
+	assert(index_count < sparse_seen);
+
+	// allocate arrays
+	adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.data = allocator.allocate<unsigned int>(index_count);
+
+	// fill triangle counts
+	for (size_t i = 0; i < index_count; ++i)
+		assert(indices[i] < vertex_count);
+
+	for (size_t i = 0; i < index_count; ++i)
+		adjacency.counts[indices[i]] = 0;
+
+	for (size_t i = 0; i < index_count; ++i)
+		adjacency.counts[indices[i]]++;
+
+	// fill offset table; uses sparse_seen bit to tag visited vertices
+	unsigned int offset = 0;
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int v = indices[i];
+
+		if ((adjacency.counts[v] & sparse_seen) == 0)
+		{
+			adjacency.offsets[v] = offset;
+			offset += adjacency.counts[v];
+			adjacency.counts[v] |= sparse_seen;
+		}
+	}
+
+	assert(offset == index_count);
+
+	// fill triangle data
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+
+		adjacency.data[adjacency.offsets[a]++] = unsigned(i);
+		adjacency.data[adjacency.offsets[b]++] = unsigned(i);
+		adjacency.data[adjacency.offsets[c]++] = unsigned(i);
+	}
+
+	// fix offsets that have been disturbed by the previous pass
+	// also fix counts (that were marked with sparse_seen by the first pass)
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int v = indices[i];
+
+		if (adjacency.counts[v] & sparse_seen)
+		{
+			adjacency.counts[v] &= ~sparse_seen;
+
+			assert(adjacency.offsets[v] >= adjacency.counts[v]);
+			adjacency.offsets[v] -= adjacency.counts[v];
+		}
+	}
+}
+
+static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride)
 {
 	assert(count > 0);

+	size_t points_stride_float = points_stride / sizeof(float);
+	size_t radii_stride_float = radii_stride / sizeof(float);
+
 	// find extremum points along all 3 axes; for each axis we get a pair of points with min/max coordinates
 	size_t pmin[3] = {0, 0, 0};
 	size_t pmax[3] = {0, 0, 0};

 	for (size_t i = 0; i < count; ++i)
 	{
-		const float* p = points[i];
+		const float* p = points + i * points_stride_float;
+		float r = radii[i * radii_stride_float];

 		for (int axis = 0; axis < 3; ++axis)
 		{
-			pmin[axis] = (p[axis] < points[pmin[axis]][axis]) ? i : pmin[axis];
-			pmax[axis] = (p[axis] > points[pmax[axis]][axis]) ? i : pmax[axis];
+			float bmin = points[pmin[axis] * points_stride_float + axis] - radii[pmin[axis] * radii_stride_float];
+			float bmax = points[pmax[axis] * points_stride_float + axis] + radii[pmax[axis] * radii_stride_float];
+
+			pmin[axis] = (p[axis] - r < bmin) ? i : pmin[axis];
+			pmax[axis] = (p[axis] + r > bmax) ? i : pmax[axis];
 		}
 	}

 	// find the pair of points with largest distance
-	float paxisd2 = 0;
 	int paxis = 0;
+	float paxisdr = 0;

 	for (int axis = 0; axis < 3; ++axis)
 	{
-		const float* p1 = points[pmin[axis]];
-		const float* p2 = points[pmax[axis]];
+		const float* p1 = points + pmin[axis] * points_stride_float;
+		const float* p2 = points + pmax[axis] * points_stride_float;
+		float r1 = radii[pmin[axis] * radii_stride_float];
+		float r2 = radii[pmax[axis] * radii_stride_float];

 		float d2 = (p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]);
+		float dr = sqrtf(d2) + r1 + r2;

-		if (d2 > paxisd2)
+		if (dr > paxisdr)
 		{
-			paxisd2 = d2;
+			paxisdr = dr;
 			paxis = axis;
 		}
 	}

 	// use the longest segment as the initial sphere diameter
-	const float* p1 = points[pmin[paxis]];
-	const float* p2 = points[pmax[paxis]];
+	const float* p1 = points + pmin[paxis] * points_stride_float;
+	const float* p2 = points + pmax[paxis] * points_stride_float;
+	float r1 = radii[pmin[paxis] * radii_stride_float];
+	float r2 = radii[pmax[paxis] * radii_stride_float];

-	float center[3] = {(p1[0] + p2[0]) / 2, (p1[1] + p2[1]) / 2, (p1[2] + p2[2]) / 2};
-	float radius = sqrtf(paxisd2) / 2;
+	float paxisd = sqrtf((p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]));
+	float paxisk = paxisd > 0 ? (paxisd + r2 - r1) / (2 * paxisd) : 0.f;
+
+	float center[3] = {p1[0] + (p2[0] - p1[0]) * paxisk, p1[1] + (p2[1] - p1[1]) * paxisk, p1[2] + (p2[2] - p1[2]) * paxisk};
+	float radius = paxisdr / 2;

 	// iteratively adjust the sphere up until all points fit
 	for (size_t i = 0; i < count; ++i)
 	{
-		const float* p = points[i];
+		const float* p = points + i * points_stride_float;
+		float r = radii[i * radii_stride_float];
+
 		float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]);
+		float d = sqrtf(d2);

-		if (d2 > radius * radius)
+		if (d + r > radius)
 		{
-			float d = sqrtf(d2);
-			assert(d > 0);
+			float k = d > 0 ? (d + r - radius) / (2 * d) : 0.f;

-			float k = 0.5f + (radius / d) / 2;
-
-			center[0] = center[0] * k + p[0] * (1 - k);
-			center[1] = center[1] * k + p[1] * (1 - k);
-			center[2] = center[2] * k + p[2] * (1 - k);
-			radius = (radius + d) / 2;
+			center[0] += k * (p[0] - center[0]);
+			center[1] += k * (p[1] - center[1]);
+			center[2] += k * (p[2] - center[2]);
+			radius = (radius + d + r) / 2;
 		}
 	}
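The radius-aware computeBoundingSphere above is what backs the new meshopt_computeSphereBounds API added later in this diff. As a minimal usage sketch, not part of the diff itself, one can bound a set of cluster spheres packed as x, y, z, radius; the packed layout is an illustrative assumption:

	#include "meshoptimizer.h"

	// One sphere per cluster packed as x, y, z, radius, so a 16-byte stride
	// serves both the position stream and the radius stream.
	meshopt_Bounds boundSpheres(const float* spheres_xyzr, size_t count)
	{
		return meshopt_computeSphereBounds(spheres_xyzr, count, sizeof(float) * 4,
		    spheres_xyzr + 3, sizeof(float) * 4);
	}

Passing radii = NULL instead treats the input as plain points; only center and radius of the result are filled, the remaining fields are zero.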
@@ -151,12 +235,25 @@ struct Cone
 	float nx, ny, nz;
 };

-static float getMeshletScore(float distance2, float spread, float cone_weight, float expected_radius)
+static float getDistance(float dx, float dy, float dz, bool aa)
 {
+	if (!aa)
+		return sqrtf(dx * dx + dy * dy + dz * dz);
+
+	float rx = fabsf(dx), ry = fabsf(dy), rz = fabsf(dz);
+	float rxy = rx > ry ? rx : ry;
+	return rxy > rz ? rxy : rz;
+}
+
+static float getMeshletScore(float distance, float spread, float cone_weight, float expected_radius)
+{
+	if (cone_weight < 0)
+		return 1 + distance / expected_radius;
+
 	float cone = 1.f - spread * cone_weight;
 	float cone_clamped = cone < 1e-3f ? 1e-3f : cone;

-	return (1 + sqrtf(distance2) / expected_radius * (1 - cone_weight)) * cone_clamped;
+	return (1 + distance / expected_radius * (1 - cone_weight)) * cone_clamped;
 }

 static Cone getMeshletCone(const Cone& acc, unsigned int triangle_count)
@@ -230,22 +327,22 @@ static void finishMeshlet(meshopt_Meshlet& meshlet, unsigned char* meshlet_trian
 		meshlet_triangles[offset++] = 0;
 }

-static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, unsigned char* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles)
+static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, short* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles, bool split = false)
 {
-	unsigned char& av = used[a];
-	unsigned char& bv = used[b];
-	unsigned char& cv = used[c];
+	short& av = used[a];
+	short& bv = used[b];
+	short& cv = used[c];

 	bool result = false;

-	int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff);
+	int used_extra = (av < 0) + (bv < 0) + (cv < 0);

-	if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles)
+	if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles || split)
 	{
 		meshlets[meshlet_offset] = meshlet;

 		for (size_t j = 0; j < meshlet.vertex_count; ++j)
-			used[meshlet_vertices[meshlet.vertex_offset + j]] = 0xff;
+			used[meshlet_vertices[meshlet.vertex_offset + j]] = -1;

 		finishMeshlet(meshlet, meshlet_triangles);

@@ -257,33 +354,33 @@ static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int
 		result = true;
 	}

-	if (av == 0xff)
+	if (av < 0)
 	{
-		av = (unsigned char)meshlet.vertex_count;
+		av = short(meshlet.vertex_count);
 		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = a;
 	}

-	if (bv == 0xff)
+	if (bv < 0)
 	{
-		bv = (unsigned char)meshlet.vertex_count;
+		bv = short(meshlet.vertex_count);
 		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = b;
 	}

-	if (cv == 0xff)
+	if (cv < 0)
 	{
-		cv = (unsigned char)meshlet.vertex_count;
+		cv = short(meshlet.vertex_count);
 		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = c;
 	}

-	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = av;
-	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = bv;
-	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = cv;
+	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = (unsigned char)av;
+	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = (unsigned char)bv;
+	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = (unsigned char)cv;
 	meshlet.triangle_count++;

 	return result;
 }

-static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone* meshlet_cone, unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const unsigned char* used, float meshlet_expected_radius, float cone_weight)
+static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone& meshlet_cone, const unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const short* used, float meshlet_expected_radius, float cone_weight)
 {
 	unsigned int best_triangle = ~0u;
 	int best_priority = 5;
@@ -301,7 +398,7 @@ static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Co
 			unsigned int triangle = neighbors[j];
 			unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];

-			int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff);
+			int extra = (used[a] < 0) + (used[b] < 0) + (used[c] < 0);
 			assert(extra <= 2);

 			int priority = -1;
@@ -323,27 +420,13 @@ static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Co
 			if (priority > best_priority)
 				continue;

-			float score = 0;
+			const Cone& tri_cone = triangles[triangle];

-			// caller selects one of two scoring functions: geometrical (based on meshlet cone) or topological (based on remaining triangles)
-			if (meshlet_cone)
-			{
-				const Cone& tri_cone = triangles[triangle];
+			float dx = tri_cone.px - meshlet_cone.px, dy = tri_cone.py - meshlet_cone.py, dz = tri_cone.pz - meshlet_cone.pz;
+			float distance = getDistance(dx, dy, dz, cone_weight < 0);
+			float spread = tri_cone.nx * meshlet_cone.nx + tri_cone.ny * meshlet_cone.ny + tri_cone.nz * meshlet_cone.nz;

-				float distance2 =
-				    (tri_cone.px - meshlet_cone->px) * (tri_cone.px - meshlet_cone->px) +
-				    (tri_cone.py - meshlet_cone->py) * (tri_cone.py - meshlet_cone->py) +
-				    (tri_cone.pz - meshlet_cone->pz) * (tri_cone.pz - meshlet_cone->pz);
-
-				float spread = tri_cone.nx * meshlet_cone->nx + tri_cone.ny * meshlet_cone->ny + tri_cone.nz * meshlet_cone->nz;
-
-				score = getMeshletScore(distance2, spread, cone_weight, meshlet_expected_radius);
-			}
-			else
-			{
-				// each live_triangles entry is >= 1 since it includes the current triangle we're processing
-				score = float(live_triangles[a] + live_triangles[b] + live_triangles[c] - 3);
-			}
+			float score = getMeshletScore(distance, spread, cone_weight, meshlet_expected_radius);

 			// note that topology-based priority is always more important than the score
 			// this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost
@@ -359,6 +442,113 @@ static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Co
 	return best_triangle;
 }

+static size_t appendSeedTriangles(unsigned int* seeds, const meshopt_Meshlet& meshlet, const unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, float cornerx, float cornery, float cornerz)
+{
+	unsigned int best_seeds[kMeshletAddSeeds];
+	unsigned int best_live[kMeshletAddSeeds];
+	float best_score[kMeshletAddSeeds];
+
+	for (size_t i = 0; i < kMeshletAddSeeds; ++i)
+	{
+		best_seeds[i] = ~0u;
+		best_live[i] = ~0u;
+		best_score[i] = FLT_MAX;
+	}
+
+	for (size_t i = 0; i < meshlet.vertex_count; ++i)
+	{
+		unsigned int index = meshlet_vertices[meshlet.vertex_offset + i];
+
+		unsigned int best_neighbor = ~0u;
+		unsigned int best_neighbor_live = ~0u;
+
+		// find the neighbor with the smallest live metric
+		unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
+		size_t neighbors_size = adjacency.counts[index];
+
+		for (size_t j = 0; j < neighbors_size; ++j)
+		{
+			unsigned int triangle = neighbors[j];
+			unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
+
+			unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c];
+
+			if (live < best_neighbor_live)
+			{
+				best_neighbor = triangle;
+				best_neighbor_live = live;
+			}
+		}
+
+		// add the neighbor to the list of seeds; the list is unsorted and the replacement criteria is approximate
+		if (best_neighbor == ~0u)
+			continue;
+
+		float best_neighbor_score = getDistance(triangles[best_neighbor].px - cornerx, triangles[best_neighbor].py - cornery, triangles[best_neighbor].pz - cornerz, false);
+
+		for (size_t j = 0; j < kMeshletAddSeeds; ++j)
+		{
+			// non-strict comparison reduces the number of duplicate seeds (triangles adjacent to multiple vertices)
+			if (best_neighbor_live < best_live[j] || (best_neighbor_live == best_live[j] && best_neighbor_score <= best_score[j]))
+			{
+				best_seeds[j] = best_neighbor;
+				best_live[j] = best_neighbor_live;
+				best_score[j] = best_neighbor_score;
+				break;
+			}
+		}
+	}
+
+	// add surviving seeds to the meshlet
+	size_t seed_count = 0;
+
+	for (size_t i = 0; i < kMeshletAddSeeds; ++i)
+		if (best_seeds[i] != ~0u)
+			seeds[seed_count++] = best_seeds[i];
+
+	return seed_count;
+}
+
+static size_t pruneSeedTriangles(unsigned int* seeds, size_t seed_count, const unsigned char* emitted_flags)
+{
+	size_t result = 0;
+
+	for (size_t i = 0; i < seed_count; ++i)
+	{
+		unsigned int index = seeds[i];
+
+		seeds[result] = index;
+		result += emitted_flags[index] == 0;
+	}
+
+	return result;
+}
+
+static unsigned int selectSeedTriangle(const unsigned int* seeds, size_t seed_count, const unsigned int* indices, const Cone* triangles, const unsigned int* live_triangles, float cornerx, float cornery, float cornerz)
+{
+	unsigned int best_seed = ~0u;
+	unsigned int best_live = ~0u;
+	float best_score = FLT_MAX;
+
+	for (size_t i = 0; i < seed_count; ++i)
+	{
+		unsigned int index = seeds[i];
+		unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
+
+		unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c];
+		float score = getDistance(triangles[index].px - cornerx, triangles[index].py - cornery, triangles[index].pz - cornerz, false);
+
+		if (live < best_live || (live == best_live && score < best_score))
+		{
+			best_seed = index;
+			best_live = live;
+			best_score = score;
+		}
+	}
+
+	return best_seed;
+}
+
 struct KDNode
 {
 	union
@@ -467,7 +657,7 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const
 	return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size);
 }

-static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit)
+static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, bool aa, unsigned int& result, float& limit)
 {
 	const KDNode& node = nodes[root];

@@ -483,11 +673,8 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points,

 			const float* point = points + index * stride;

-			float distance2 =
-			    (point[0] - position[0]) * (point[0] - position[0]) +
-			    (point[1] - position[1]) * (point[1] - position[1]) +
-			    (point[2] - position[2]) * (point[2] - position[2]);
-			float distance = sqrtf(distance2);
+			float dx = point[0] - position[0], dy = point[1] - position[1], dz = point[2] - position[2];
+			float distance = getDistance(dx, dy, dz, aa);

 			if (distance < limit)
 			{
@@ -503,11 +690,11 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points,
 		unsigned int first = (delta <= 0) ? 0 : node.children;
 		unsigned int second = first ^ node.children;

-		kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, result, limit);
+		kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, aa, result, limit);

 		// only process the other node if it can have a match based on closest distance so far
 		if (fabsf(delta) <= limit)
-			kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, result, limit);
+			kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, aa, result, limit);
 	}
 }
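The aa flag threaded through kdtreeNearest selects the same Chebyshev (max-axis) metric that getMeshletScore uses when cone_weight is negative, so the spatial fallback stays consistent with the scoring metric. A hedged sketch of requesting axis-aligned clusters through the new entry point defined below; all buffer variables and the specific limits are illustrative assumptions:

	// Negative cone_weight switches both scoring and the k-d tree search to the
	// Chebyshev metric, biasing clusters toward axis-aligned boxes (raytracing).
	// Buffers must be sized via meshopt_buildMeshletsBound with min_triangles.
	size_t meshlet_count = meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles,
	    indices, index_count, vertex_positions, vertex_count, sizeof(float) * 3,
	    /* max_vertices= */ 64, /* min_triangles= */ 16, /* max_triangles= */ 64,
	    /* cone_weight= */ -0.25f, /* split_factor= */ 2.0f);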
@@ -535,7 +722,7 @@ size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_
 	return meshlet_limit_vertices > meshlet_limit_triangles ? meshlet_limit_vertices : meshlet_limit_triangles;
 }

-size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
+size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor)
 {
 	using namespace meshopt;

@@ -544,18 +731,25 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
 	assert(vertex_positions_stride % sizeof(float) == 0);

 	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
-	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
-	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
+	assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles);
+	assert(min_triangles % 4 == 0 && max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned

-	assert(cone_weight >= 0 && cone_weight <= 1);
+	assert(cone_weight <= 1); // negative cone weight switches metric to optimize for axis-aligned meshlets
+	assert(split_factor >= 0);
+
+	if (index_count == 0)
+		return 0;

 	meshopt_Allocator allocator;

 	TriangleAdjacency2 adjacency = {};
-	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+	if (vertex_count > index_count && index_count < (1u << 31))
+		buildTriangleAdjacencySparse(adjacency, indices, index_count, vertex_count, allocator);
+	else
+		buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);

-	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
-	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
+	// live triangle counts; note, we alias adjacency.counts as we remove triangles after emitting them so the counts always match
+	unsigned int* live_triangles = adjacency.counts;

 	size_t face_count = index_count / 3;

@@ -578,9 +772,42 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
 	KDNode* nodes = allocator.allocate<KDNode>(face_count * 2);
 	kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8);

-	// index of the vertex in the meshlet, 0xff if the vertex isn't used
-	unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
-	memset(used, -1, vertex_count);
+	// find a specific corner of the mesh to use as a starting point for meshlet flow
+	float cornerx = FLT_MAX, cornery = FLT_MAX, cornerz = FLT_MAX;
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		const Cone& tri = triangles[i];
+
+		cornerx = cornerx > tri.px ? tri.px : cornerx;
+		cornery = cornery > tri.py ? tri.py : cornery;
+		cornerz = cornerz > tri.pz ? tri.pz : cornerz;
+	}
+
+	// index of the vertex in the meshlet, -1 if the vertex isn't used
+	short* used = allocator.allocate<short>(vertex_count);
+	memset(used, -1, vertex_count * sizeof(short));
+
+	// initial seed triangle is the one closest to the corner
+	unsigned int initial_seed = ~0u;
+	float initial_score = FLT_MAX;
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		const Cone& tri = triangles[i];
+
+		float score = getDistance(tri.px - cornerx, tri.py - cornery, tri.pz - cornerz, false);
+
+		if (initial_seed == ~0u || score < initial_score)
+		{
+			initial_seed = unsigned(i);
+			initial_score = score;
+		}
+	}
+
+	// seed triangles to continue meshlet flow
+	unsigned int seeds[kMeshletMaxSeeds] = {};
+	size_t seed_count = 0;

 	meshopt_Meshlet meshlet = {};
 	size_t meshlet_offset = 0;
@@ -591,46 +818,61 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
 	{
 		Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count);

-		unsigned int best_triangle = getNeighborTriangle(meshlet, &meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight);
-		int best_extra = best_triangle == ~0u ? -1 : (used[indices[best_triangle * 3 + 0]] == 0xff) + (used[indices[best_triangle * 3 + 1]] == 0xff) + (used[indices[best_triangle * 3 + 2]] == 0xff);
+		unsigned int best_triangle = ~0u;

-		// if the best triangle doesn't fit into current meshlet, the spatial scoring we've used is not very meaningful, so we re-select using topological scoring
-		if (best_triangle != ~0u && (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles))
-		{
-			best_triangle = getNeighborTriangle(meshlet, NULL, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, 0.f);
-		}
+		// for the first triangle, we don't have a meshlet cone yet, so we use the initial seed
+		// to continue the meshlet, we select an adjacent triangle based on connectivity and spatial scoring
+		if (meshlet_offset == 0 && meshlet.triangle_count == 0)
+			best_triangle = initial_seed;
+		else
+			best_triangle = getNeighborTriangle(meshlet, meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight);

-		// when we run out of neighboring triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity
+		bool split = false;
+
+		// when we run out of adjacent triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity
 		if (best_triangle == ~0u)
 		{
 			float position[3] = {meshlet_cone.px, meshlet_cone.py, meshlet_cone.pz};
 			unsigned int index = ~0u;
-			float limit = FLT_MAX;
+			float distance = FLT_MAX;

-			kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, limit);
+			kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, cone_weight < 0.f, index, distance);

 			best_triangle = index;
+			split = meshlet.triangle_count >= min_triangles && split_factor > 0 && distance > meshlet_expected_radius * split_factor;
 		}

 		if (best_triangle == ~0u)
 			break;

+		int best_extra = (used[indices[best_triangle * 3 + 0]] < 0) + (used[indices[best_triangle * 3 + 1]] < 0) + (used[indices[best_triangle * 3 + 2]] < 0);
+
+		// if the best triangle doesn't fit into current meshlet, we re-select using seeds to maintain global flow
+		if (split || (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles))
+		{
+			seed_count = pruneSeedTriangles(seeds, seed_count, emitted_flags);
+			seed_count = (seed_count + kMeshletAddSeeds <= kMeshletMaxSeeds) ? seed_count : kMeshletMaxSeeds - kMeshletAddSeeds;
+			seed_count += appendSeedTriangles(seeds + seed_count, meshlet, meshlet_vertices, indices, adjacency, triangles, live_triangles, cornerx, cornery, cornerz);
+
+			unsigned int best_seed = selectSeedTriangle(seeds, seed_count, indices, triangles, live_triangles, cornerx, cornery, cornerz);
+
+			// we may not find a valid seed triangle if the mesh is disconnected as seeds are based on adjacency
+			best_triangle = best_seed != ~0u ? best_seed : best_triangle;
+		}
+
 		unsigned int a = indices[best_triangle * 3 + 0], b = indices[best_triangle * 3 + 1], c = indices[best_triangle * 3 + 2];
 		assert(a < vertex_count && b < vertex_count && c < vertex_count);

 		// add meshlet to the output; when the current meshlet is full we reset the accumulated bounds
-		if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles))
+		if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split))
 		{
 			meshlet_offset++;
 			memset(&meshlet_cone_acc, 0, sizeof(meshlet_cone_acc));
 		}

-		live_triangles[a]--;
-		live_triangles[b]--;
-		live_triangles[c]--;
-
 		// remove emitted triangle from adjacency data
 		// this makes sure that we spend less time traversing these lists on subsequent iterations
+		// live triangle counts are updated as a byproduct of these adjustments
 		for (size_t k = 0; k < 3; ++k)
 		{
 			unsigned int index = indices[best_triangle * 3 + k];
@@ -659,6 +901,7 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
 		meshlet_cone_acc.ny += triangles[best_triangle].ny;
 		meshlet_cone_acc.nz += triangles[best_triangle].nz;

+		assert(!emitted_flags[best_triangle]);
 		emitted_flags[best_triangle] = 1;
 	}

@@ -669,10 +912,17 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
 		meshlets[meshlet_offset++] = meshlet;
 	}

-	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles));
 	return meshlet_offset;
 }

+size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
+{
+	assert(cone_weight >= 0); // to use negative cone weight, use meshopt_buildMeshletsFlex
+
+	return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, max_triangles, max_triangles, cone_weight, 0.0f);
+}
+
 size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
 {
 	using namespace meshopt;
@@ -685,9 +935,9 @@ size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshle
 	meshopt_Allocator allocator;

-	// index of the vertex in the meshlet, 0xff if the vertex isn't used
-	unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
-	memset(used, -1, vertex_count);
+	// index of the vertex in the meshlet, -1 if the vertex isn't used
+	short* used = allocator.allocate<short>(vertex_count);
+	memset(used, -1, vertex_count * sizeof(short));

 	meshopt_Meshlet meshlet = {};
 	size_t meshlet_offset = 0;
@@ -768,15 +1018,17 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t
 	if (triangles == 0)
 		return bounds;

+	const float rzero = 0.f;
+
 	// compute cluster bounding sphere; we'll use the center to determine normal cone apex as well
 	float psphere[4] = {};
-	computeBoundingSphere(psphere, corners[0], triangles * 3);
+	computeBoundingSphere(psphere, corners[0][0], triangles * 3, sizeof(float) * 3, &rzero, 0);

 	float center[3] = {psphere[0], psphere[1], psphere[2]};

 	// treating triangle normals as points, find the bounding sphere - the sphere center determines the optimal cone axis
 	float nsphere[4] = {};
-	computeBoundingSphere(nsphere, normals, triangles);
+	computeBoundingSphere(nsphere, normals[0], triangles, sizeof(float) * 3, &rzero, 0);

 	float axis[3] = {nsphere[0], nsphere[1], nsphere[2]};
 	float axislength = sqrtf(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]);
@@ -886,6 +1138,33 @@ meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices
 	return meshopt_computeClusterBounds(indices, triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride);
 }
+meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride)
+{
+	using namespace meshopt;
+
+	assert(positions_stride >= 12 && positions_stride <= 256);
+	assert(positions_stride % sizeof(float) == 0);
+	assert((radii_stride >= 4 && radii_stride <= 256) || radii == NULL);
+	assert(radii_stride % sizeof(float) == 0);
+
+	meshopt_Bounds bounds = {};
+
+	if (count == 0)
+		return bounds;
+
+	const float rzero = 0.f;
+
+	float psphere[4] = {};
+	computeBoundingSphere(psphere, positions, count, positions_stride, radii ? radii : &rzero, radii ? radii_stride : 0);
+
+	bounds.center[0] = psphere[0];
+	bounds.center[1] = psphere[1];
+	bounds.center[2] = psphere[2];
+	bounds.radius = psphere[3];
+
+	return bounds;
+}
+
 void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count)
 {
 	using namespace meshopt;
@@ -953,23 +1232,23 @@ void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* mesh
 	// reorder meshlet vertices for access locality assuming index buffer is scanned sequentially
 	unsigned int order[kMeshletMaxVertices];

-	unsigned char remap[kMeshletMaxVertices];
-	memset(remap, -1, vertex_count);
+	short remap[kMeshletMaxVertices];
+	memset(remap, -1, vertex_count * sizeof(short));

 	size_t vertex_offset = 0;

 	for (size_t i = 0; i < triangle_count * 3; ++i)
 	{
-		unsigned char& r = remap[indices[i]];
-		if (r == 0xff)
+		short& r = remap[indices[i]];
+		if (r < 0)
 		{
-			r = (unsigned char)(vertex_offset);
+			r = short(vertex_offset);
 			order[vertex_offset] = vertices[indices[i]];
 			vertex_offset++;
 		}

-		indices[i] = r;
+		indices[i] = (unsigned char)r;
 	}

 	assert(vertex_offset <= vertex_count);
diff --git a/thirdparty/meshoptimizer/indexcodec.cpp b/thirdparty/meshoptimizer/indexcodec.cpp
index b3004600523..b4fdfe16d5c 100644
--- a/thirdparty/meshoptimizer/indexcodec.cpp
+++ b/thirdparty/meshoptimizer/indexcodec.cpp
@@ -14,6 +14,7 @@ const unsigned char kIndexHeader = 0xe0;
 const unsigned char kSequenceHeader = 0xd0;

 static int gEncodeIndexVersion = 1;
+const int kDecodeIndexVersion = 1;

 typedef unsigned int VertexFifo[16];
 typedef unsigned int EdgeFifo[16][2];
@@ -354,11 +355,28 @@ size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count)

 void meshopt_encodeIndexVersion(int version)
 {
-	assert(unsigned(version) <= 1);
+	assert(unsigned(version) <= unsigned(meshopt::kDecodeIndexVersion));

 	meshopt::gEncodeIndexVersion = version;
 }

+int meshopt_decodeIndexVersion(const unsigned char* buffer, size_t buffer_size)
+{
+	if (buffer_size < 1)
+		return -1;
+
+	unsigned char header = buffer[0];
+
+	if ((header & 0xf0) != meshopt::kIndexHeader && (header & 0xf0) != meshopt::kSequenceHeader)
+		return -1;
+
+	int version = header & 0x0f;
+	if (version > meshopt::kDecodeIndexVersion)
+		return -1;
+
+	return version;
+}
+
 int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size)
 {
 	using namespace meshopt;
@@ -374,7 +392,7 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde
 		return -1;

 	int version = buffer[0] & 0x0f;
-	if (version > 1)
+	if (version > kDecodeIndexVersion)
 		return -1;

 	EdgeFifo edgefifo;
@@ -627,7 +645,7 @@ int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t in
 		return -1;

 	int version = buffer[0] & 0x0f;
-	if (version > 1)
+	if (version > kDecodeIndexVersion)
 		return -1;

 	const unsigned char* data = buffer + 1;
diff --git a/thirdparty/meshoptimizer/meshoptimizer.h b/thirdparty/meshoptimizer/meshoptimizer.h
index 77be5371fc7..295324c784d 100644
--- a/thirdparty/meshoptimizer/meshoptimizer.h
+++ b/thirdparty/meshoptimizer/meshoptimizer.h
@@ -1,7 +1,7 @@
 /**
- * meshoptimizer - version 0.22
+ * meshoptimizer - version 0.23
  *
- * Copyright (C) 2016-2024, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Copyright (C) 2016-2025, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
  * Report bugs and download new versions at https://github.com/zeux/meshoptimizer
  *
  * This library is distributed under the MIT License. See notice at the end of this file.
@@ -12,7 +12,7 @@
 #include <stddef.h>

 /* Version macro; major * 1000 + minor * 10 + patch */
-#define MESHOPTIMIZER_VERSION 220 /* 0.22 */
+#define MESHOPTIMIZER_VERSION 230 /* 0.23 */

 /* If no API is defined, assume default */
 #ifndef MESHOPTIMIZER_API
@@ -243,6 +243,13 @@ MESHOPTIMIZER_API void meshopt_encodeIndexVersion(int version);
 */
 MESHOPTIMIZER_API int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size);

+/**
+ * Get encoded index format version
+ * Returns format version of the encoded index buffer/sequence, or -1 if the buffer header is invalid
+ * Note that a non-negative value doesn't guarantee that the buffer will be decoded correctly if the input is malformed.
+ */
+MESHOPTIMIZER_API int meshopt_decodeIndexVersion(const unsigned char* buffer, size_t buffer_size);
+
 /**
 * Index sequence encoder
 * Encodes index sequence into an array of bytes that is generally smaller and compresses better compared to original.
@@ -277,9 +284,19 @@ MESHOPTIMIZER_API int meshopt_decodeIndexSequence(void* destination, size_t inde
 MESHOPTIMIZER_API size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size);
 MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size);

+/**
+ * Experimental: Vertex buffer encoder
+ * Encodes vertex data just like meshopt_encodeVertexBuffer, but allows to override compression level.
+ * For compression level to take effect, the vertex encoding version must be set to 1 via meshopt_encodeVertexVersion.
+ * The default compression level implied by meshopt_encodeVertexBuffer is 2.
+ *
+ * level should be in the range [0, 3] with 0 being the fastest and 3 being the slowest and producing the best compression ratio.
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level);
+
 /**
 * Set vertex encoder format version
- * version must specify the data format version to encode; valid values are 0 (decodable by all library versions)
+ * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.23+)
 */
 MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version);

@@ -293,6 +310,13 @@ MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version);
 */
 MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size);

+/**
+ * Get encoded vertex format version
+ * Returns format version of the encoded vertex buffer, or -1 if the buffer header is invalid
+ * Note that a non-negative value doesn't guarantee that the buffer will be decoded correctly if the input is malformed.
+ */
+MESHOPTIMIZER_API int meshopt_decodeVertexVersion(const unsigned char* buffer, size_t buffer_size);
+
 /**
 * Vertex buffer filters
 * These functions can be used to filter output of meshopt_decodeVertexBuffer in-place.
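The version query and level override compose naturally; a minimal sketch, not part of the diff, where the Vertex struct and the choice of level 3 are assumptions (the same decodeIndexVersion check applies to index buffers):

	#include "meshoptimizer.h"

	#include <vector>

	struct Vertex { float px, py, pz; };

	std::vector<unsigned char> encode(const std::vector<Vertex>& vertices)
	{
		meshopt_encodeVertexVersion(1); // v1 is required for the level override; decodable by 0.23+

		std::vector<unsigned char> buffer(meshopt_encodeVertexBufferBound(vertices.size(), sizeof(Vertex)));
		buffer.resize(meshopt_encodeVertexBufferLevel(buffer.data(), buffer.size(), vertices.data(), vertices.size(), sizeof(Vertex), /* level= */ 3));
		return buffer;
	}

	bool decode(std::vector<Vertex>& vertices, const std::vector<unsigned char>& buffer)
	{
		// reject buffers whose format version is unknown to this decoder;
		// vertices must be presized by the caller
		if (meshopt_decodeVertexVersion(buffer.data(), buffer.size()) < 0)
			return false;

		return meshopt_decodeVertexBuffer(vertices.data(), vertices.size(), sizeof(Vertex), buffer.data(), buffer.size()) == 0;
	}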
@@ -334,7 +358,7 @@ enum meshopt_EncodeExpMode
 	meshopt_EncodeExpSharedVector,
 	/* When encoding exponents, use shared value for each component of all vectors (best compression) */
 	meshopt_EncodeExpSharedComponent,
-	/* Experimental: When encoding exponents, use separate values for each component, but clamp to 0 (good quality if very small values are not important) */
+	/* When encoding exponents, use separate values for each component, but clamp to 0 (good quality if very small values are not important) */
 	meshopt_EncodeExpClamped,
 };

@@ -375,7 +399,7 @@ enum
 MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error);

 /**
- * Experimental: Mesh simplifier with attribute metric
+ * Mesh simplifier with attribute metric
 * The algorithm enhances meshopt_simplify by incorporating attribute values into the error metric used to prioritize simplification order; see meshopt_simplify documentation for details.
 * Note that the number of attributes affects memory requirements and running time; this algorithm requires ~1.5x more memory and time compared to meshopt_simplify when using 4 scalar attributes.
 *
@@ -384,7 +408,7 @@ MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsig
 * attribute_count must be <= 32
 * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; 1 denotes vertices that can't be moved
 */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
+MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);

 /**
 * Experimental: Mesh simplifier (sloppy)
@@ -402,7 +426,7 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* d
 MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error);

 /**
- * Experimental: Point cloud simplifier
+ * Point cloud simplifier
 * Reduces the number of points in the cloud to reach the given target
 * Returns the number of points after simplification, with destination containing new index data
 * The resulting index buffer references vertices from the original vertex buffer.
@@ -410,10 +434,10 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destinati
 *
 * destination must contain enough space for the target index buffer (target_vertex_count elements)
 * vertex_positions should have float3 position in the first 12 bytes of each vertex
- * vertex_colors should can be NULL; when it's not NULL, it should have float3 color in the first 12 bytes of each vertex
+ * vertex_colors can be NULL; when it's not NULL, it should have float3 color in the first 12 bytes of each vertex
 * color_weight determines relative priority of color wrt position; 1.0 is a safe default
 */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count);
+MESHOPTIMIZER_API size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count);

 /**
 * Returns the error scaling factor used by the simplifier to convert between absolute and relative extents
@@ -520,7 +544,7 @@ struct meshopt_Meshlet
 * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices
 * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3
 * vertex_positions should have float3 position in the first 12 bytes of each vertex
- * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 255 - not 256!, max_triangles <= 512; max_triangles must be divisible by 4)
+ * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; max_triangles must be divisible by 4)
 * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
 */
 MESHOPTIMIZER_API size_t meshopt_buildMeshlets(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight);
@@ -528,14 +552,30 @@ MESHOPTIMIZER_API size_t meshopt_buildMeshletsScan(struct meshopt_Meshlet* meshl
 MESHOPTIMIZER_API size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles);

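The sizing rules above translate directly into allocation code. A sketch under stated assumptions (tightly packed float3 positions; the limits 64/124 are illustrative):

	#include "meshoptimizer.h"

	#include <vector>

	size_t buildMeshletsSized(std::vector<meshopt_Meshlet>& meshlets,
	    std::vector<unsigned int>& meshlet_vertices, std::vector<unsigned char>& meshlet_triangles,
	    const std::vector<unsigned int>& indices, const std::vector<float>& positions)
	{
		const size_t max_vertices = 64;
		const size_t max_triangles = 124; // <= 512 and divisible by 4

		size_t max_meshlets = meshopt_buildMeshletsBound(indices.size(), max_vertices, max_triangles);

		meshlets.resize(max_meshlets);
		meshlet_vertices.resize(max_meshlets * max_vertices);
		meshlet_triangles.resize(max_meshlets * max_triangles * 3);

		return meshopt_buildMeshlets(meshlets.data(), meshlet_vertices.data(), meshlet_triangles.data(),
		    indices.data(), indices.size(), positions.data(), positions.size() / 3, sizeof(float) * 3,
		    max_vertices, max_triangles, /* cone_weight= */ 0.f);
	}

For meshopt_buildMeshletsFlex (documented next), the meshlet count bound must be computed with min_triangles instead of max_triangles.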
 /**
- * Experimental: Meshlet optimizer
+ * Experimental: Meshlet builder with flexible cluster sizes
+ * Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but allows to specify minimum and maximum number of triangles per meshlet.
+ * Clusters between min and max triangle counts are split when the cluster size would have exceeded the expected cluster size by more than split_factor.
+ * Additionally, allows to switch to axis aligned clusters by setting cone_weight to a negative value.
+ *
+ * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (not max!)
+ * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices
+ * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles; both min_triangles and max_triangles must be divisible by 4)
+ * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency; additionally, cone_weight can be set to a negative value to prioritize axis aligned clusters (for raytracing) instead
+ * split_factor should be set to a non-negative value; when greater than 0, clusters that have large bounds may be split unless they are under the min_triangles threshold
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsFlex(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor);
+
+/**
+ * Meshlet optimizer
 * Reorders meshlet vertices and triangles to maximize locality to improve rasterizer throughput
 *
 * meshlet_triangles and meshlet_vertices must refer to meshlet triangle and vertex index data; when buildMeshlets* is used, these
 * need to be computed from meshlet's vertex_offset and triangle_offset
- * triangle_count and vertex_count must not exceed implementation limits (vertex_count <= 255 - not 256!, triangle_count <= 512)
+ * triangle_count and vertex_count must not exceed implementation limits (vertex_count <= 256, triangle_count <= 512)
 */
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count);
+MESHOPTIMIZER_API void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count);

 struct meshopt_Bounds
 {
@@ -579,6 +619,27 @@ struct meshopt_Bounds
 MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);

+/**
+ * Experimental: Sphere bounds generator
+ * Creates bounding sphere around a set of points or a set of spheres; returns the center and radius of the sphere, with other fields of the result set to 0.
+ *
+ * positions should have float3 position in the first 12 bytes of each element
+ * radii can be NULL; when it's not NULL, it should have a non-negative float radius in the first 4 bytes of each element
+ */
+MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride);
+
+/**
+ * Experimental: Cluster partitioner
+ * Partitions clusters into groups of similar size, prioritizing grouping clusters that share vertices.
+ *
+ * destination must contain enough space for the resulting partition data (cluster_count elements)
+ * destination[i] will contain the partition id for cluster i, with the total number of partitions returned by the function
+ * cluster_indices should have the vertex indices referenced by each cluster, stored sequentially
+ * cluster_index_counts should have the number of indices in each cluster; sum of all cluster_index_counts must be equal to total_index_count
+ * target_partition_size is a target size for each partition, in clusters; the resulting partitions may be smaller or larger
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, size_t vertex_count, size_t target_partition_size);
+
 /**
 * Spatial sorter
 * Generates a remap table that can be used to reorder points for spatial locality.
@@ -598,34 +659,6 @@ MESHOPTIMIZER_API void meshopt_spatialSortRemap(unsigned int* destination, const
 */
 MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);

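The cluster_indices/cluster_index_counts layout expected by meshopt_partitionClusters can be produced directly from meshlet output. A sketch, assuming meshlets were built by meshopt_buildMeshlets as above; the target size of 8 clusters is an arbitrary choice:

	#include "meshoptimizer.h"

	#include <vector>

	size_t partitionMeshlets(std::vector<unsigned int>& partition,
	    const std::vector<meshopt_Meshlet>& meshlets, const std::vector<unsigned int>& meshlet_vertices,
	    const std::vector<unsigned char>& meshlet_triangles, size_t vertex_count)
	{
		std::vector<unsigned int> cluster_indices;
		std::vector<unsigned int> cluster_index_counts;

		for (size_t i = 0; i < meshlets.size(); ++i)
		{
			const meshopt_Meshlet& m = meshlets[i];

			// expand meshlet-local byte indices back into mesh-wide vertex indices
			for (size_t j = 0; j < m.triangle_count * 3u; ++j)
				cluster_indices.push_back(meshlet_vertices[m.vertex_offset + meshlet_triangles[m.triangle_offset + j]]);

			cluster_index_counts.push_back(m.triangle_count * 3);
		}

		partition.resize(meshlets.size());

		return meshopt_partitionClusters(partition.data(), cluster_indices.data(), cluster_indices.size(),
		    cluster_index_counts.data(), meshlets.size(), vertex_count, /* target_partition_size= */ 8);
	}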
-/**
- * Set allocation callbacks
- * These callbacks will be used instead of the default operator new/operator delete for all temporary allocations in the library.
- * Note that all algorithms only allocate memory for temporary use.
- * allocate/deallocate are always called in a stack-like order - last pointer to be allocated is deallocated first.
- */
-MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*));
-
-#ifdef __cplusplus
-} /* extern "C" */
-#endif
-
-/* Quantization into commonly supported data formats */
-#ifdef __cplusplus
-/**
- * Quantize a float in [0..1] range into an N-bit fixed point unorm value
- * Assumes reconstruction function (q / (2^N-1)), which is the case for fixed-function normalized fixed point conversion
- * Maximum reconstruction error: 1/2^(N+1)
- */
-inline int meshopt_quantizeUnorm(float v, int N);
-
-/**
- * Quantize a float in [-1..1] range into an N-bit fixed point snorm value
- * Assumes reconstruction function (q / (2^(N-1)-1)), which is the case for fixed-function normalized fixed point conversion (except early OpenGL versions)
- * Maximum reconstruction error: 1/2^N
- */
-inline int meshopt_quantizeSnorm(float v, int N);
-
 /**
 * Quantize a float into half-precision (as defined by IEEE-754 fp16) floating point value
 * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
@@ -646,6 +679,34 @@ MESHOPTIMIZER_API float meshopt_quantizeFloat(float v, int N);
 * Preserves Inf/NaN, flushes denormals to zero
 */
 MESHOPTIMIZER_API float meshopt_dequantizeHalf(unsigned short h);
+
+/**
+ * Set allocation callbacks
+ * These callbacks will be used instead of the default operator new/operator delete for all temporary allocations in the library.
+ * Note that all algorithms only allocate memory for temporary use.
+ * allocate/deallocate are always called in a stack-like order - last pointer to be allocated is deallocated first.
+ */
+MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*));
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+/* Quantization into fixed point normalized formats; these are only available as inline C++ functions */
+#ifdef __cplusplus
+/**
+ * Quantize a float in [0..1] range into an N-bit fixed point unorm value
+ * Assumes reconstruction function (q / (2^N-1)), which is the case for fixed-function normalized fixed point conversion
+ * Maximum reconstruction error: 1/2^(N+1)
+ */
+inline int meshopt_quantizeUnorm(float v, int N);
+
+/**
+ * Quantize a float in [-1..1] range into an N-bit fixed point snorm value
+ * Assumes reconstruction function (q / (2^(N-1)-1)), which is the case for fixed-function normalized fixed point conversion (except early OpenGL versions)
+ * Maximum reconstruction error: 1/2^N
+ */
+inline int meshopt_quantizeSnorm(float v, int N);
 #endif

 /**
@@ -714,8 +775,12 @@ inline size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* mes
 template <typename T>
 inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
 template <typename T>
+inline size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor);
+template <typename T>
 inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 template <typename T>
+inline size_t meshopt_partitionClusters(unsigned int* destination, const T* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, size_t vertex_count, size_t target_partition_size);
+template <typename T>
 inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 #endif

@@ -1094,6 +1159,14 @@ inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int*
 	return meshopt_buildMeshletsScan(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_count, max_vertices, max_triangles);
 }

+template <typename T>
+inline size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+
+	return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, min_triangles, max_triangles, cone_weight, split_factor);
+}
+
 template <typename T>
 inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
@@ -1102,6 +1175,14 @@ inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t inde
 	return meshopt_computeClusterBounds(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
 }

+template <typename T>
+inline size_t meshopt_partitionClusters(unsigned int* destination, const T* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, size_t vertex_count, size_t target_partition_size)
+{
+	meshopt_IndexAdapter<T> in(NULL, cluster_indices, total_index_count);
+
+	return meshopt_partitionClusters(destination, in.data, total_index_count, cluster_index_counts, cluster_count, vertex_count, target_partition_size);
+}
+
 template <typename T>
 inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
@@ -1113,7 +1194,7 @@ inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_
 #endif

 /**
- * Copyright (c) 2016-2024 Arseny Kapoulkine
+ * Copyright (c) 2016-2025 Arseny Kapoulkine
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
diff --git a/thirdparty/meshoptimizer/partition.cpp b/thirdparty/meshoptimizer/partition.cpp
new file mode 100644
index 00000000000..9c229980552
--- /dev/null
+++ b/thirdparty/meshoptimizer/partition.cpp
@@ -0,0 +1,429 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+namespace meshopt
+{
+
+struct ClusterAdjacency
+{
+	unsigned int* offsets;
+	unsigned int* clusters;
+	unsigned int* shared;
+};
+
int* ref_offsets = allocator.allocate(vertex_count + 1); + + // compute number of clusters referenced by each vertex + memset(ref_offsets, 0, vertex_count * sizeof(unsigned int)); + + for (size_t i = 0; i < cluster_count; ++i) + { + for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) + { + unsigned int v = cluster_indices[j]; + assert(v < vertex_count); + + ref_offsets[v] += 1 - used[v]; + used[v] = 1; + } + + for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) + used[cluster_indices[j]] = 0; + } + + // compute (worst-case) number of adjacent clusters for each cluster + size_t total_adjacency = 0; + + for (size_t i = 0; i < cluster_count; ++i) + { + size_t count = 0; + for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) + { + unsigned int v = cluster_indices[j]; + assert(v < vertex_count); + + // worst case is every vertex has a disjoint cluster list + count += used[v] ? 0 : ref_offsets[v] - 1; + used[v] = 1; + } + + // ... but only every other cluster can be adjacent in the end + total_adjacency += count < cluster_count - 1 ? count : cluster_count - 1; + + for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) + used[cluster_indices[j]] = 0; + } + + // we can now allocate adjacency buffers + adjacency.offsets = allocator.allocate(cluster_count + 1); + adjacency.clusters = allocator.allocate(total_adjacency); + adjacency.shared = allocator.allocate(total_adjacency); + + // convert ref counts to offsets + size_t total_refs = 0; + + for (size_t i = 0; i < vertex_count; ++i) + { + size_t count = ref_offsets[i]; + ref_offsets[i] = unsigned(total_refs); + total_refs += count; + } + + unsigned int* ref_data = allocator.allocate(total_refs); + + // fill cluster refs for each vertex + for (size_t i = 0; i < cluster_count; ++i) + { + for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) + { + unsigned int v = cluster_indices[j]; + assert(v < vertex_count); + + if (used[v]) + continue; + + ref_data[ref_offsets[v]++] = unsigned(i); + used[v] = 1; + } + + for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) + used[cluster_indices[j]] = 0; + } + + // after the previous pass, ref_offsets contain the end of the data for each vertex; shift it forward to get the start + memmove(ref_offsets + 1, ref_offsets, vertex_count * sizeof(unsigned int)); + ref_offsets[0] = 0; + + // fill cluster adjacency for each cluster... + adjacency.offsets[0] = 0; + + for (size_t i = 0; i < cluster_count; ++i) + { + unsigned int* adj = adjacency.clusters + adjacency.offsets[i]; + unsigned int* shd = adjacency.shared + adjacency.offsets[i]; + size_t count = 0; + + for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) + { + unsigned int v = cluster_indices[j]; + assert(v < vertex_count); + + if (used[v]) + continue; + + // merge the entire cluster list of each vertex into current list + for (size_t k = ref_offsets[v]; k < ref_offsets[v + 1]; ++k) + { + unsigned int c = ref_data[k]; + assert(c < cluster_count); + + if (c == unsigned(i)) + continue; + + // if the cluster is already in the list, increment the shared count + bool found = false; + for (size_t l = 0; l < count; ++l) + if (adj[l] == c) + { + found = true; + shd[l]++; + break; + } + + // .. 
or append a new cluster + if (!found) + { + adj[count] = c; + shd[count] = 1; + count++; + } + } + + used[v] = 1; + } + + for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) + used[cluster_indices[j]] = 0; + + // mark the end of the adjacency list; the next cluster will start there as well + adjacency.offsets[i + 1] = adjacency.offsets[i] + unsigned(count); + } + + assert(adjacency.offsets[cluster_count] <= total_adjacency); + + // ref_offsets can't be deallocated as it was allocated before adjacency + allocator.deallocate(ref_data); +} + +struct ClusterGroup +{ + int group; + int next; + unsigned int size; // 0 unless root + unsigned int vertices; +}; + +struct GroupOrder +{ + unsigned int id; + int order; +}; + +static void heapPush(GroupOrder* heap, size_t size, GroupOrder item) +{ + // insert a new element at the end (breaks heap invariant) + heap[size++] = item; + + // bubble up the new element to its correct position + size_t i = size - 1; + while (i > 0 && heap[i].order < heap[(i - 1) / 2].order) + { + size_t p = (i - 1) / 2; + + GroupOrder temp = heap[i]; + heap[i] = heap[p]; + heap[p] = temp; + i = p; + } +} + +static GroupOrder heapPop(GroupOrder* heap, size_t size) +{ + assert(size > 0); + GroupOrder top = heap[0]; + + // move the last element to the top (breaks heap invariant) + heap[0] = heap[--size]; + + // bubble down the new top element to its correct position + size_t i = 0; + while (i * 2 + 1 < size) + { + // find the smallest child + size_t j = i * 2 + 1; + j += (j + 1 < size && heap[j + 1].order < heap[j].order); + + // if the parent is already smaller than both children, we're done + if (heap[j].order >= heap[i].order) + break; + + // otherwise, swap the parent and child and continue + GroupOrder temp = heap[i]; + heap[i] = heap[j]; + heap[j] = temp; + i = j; + } + + return top; +} + +static unsigned int countTotal(const ClusterGroup* groups, int id, const unsigned int* cluster_indices, const unsigned int* cluster_offsets, unsigned char* used) +{ + unsigned int total = 0; + + for (int i = id; i >= 0; i = groups[i].next) + { + for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) + { + unsigned int v = cluster_indices[j]; + total += 1 - used[v]; + used[v] = 1; + } + } + + for (int i = id; i >= 0; i = groups[i].next) + { + for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) + used[cluster_indices[j]] = 0; + } + + return total; +} + +static unsigned int countShared(const ClusterGroup* groups, int group1, int group2, const ClusterAdjacency& adjacency) +{ + unsigned int total = 0; + + for (int i1 = group1; i1 >= 0; i1 = groups[i1].next) + for (int i2 = group2; i2 >= 0; i2 = groups[i2].next) + { + for (unsigned int adj = adjacency.offsets[i1]; adj < adjacency.offsets[i1 + 1]; ++adj) + if (adjacency.clusters[adj] == unsigned(i2)) + { + total += adjacency.shared[adj]; + break; + } + } + + return total; +} + +static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdjacency& adjacency, size_t max_partition_size) +{ + assert(groups[id].size > 0); + + float group_rsqrt = 1.f / sqrtf(float(int(groups[id].vertices))); + + int best_group = -1; + float best_score = 0; + + for (int ci = id; ci >= 0; ci = groups[ci].next) + { + for (unsigned int adj = adjacency.offsets[ci]; adj != adjacency.offsets[ci + 1]; ++adj) + { + int other = groups[adjacency.clusters[adj]].group; + if (other < 0) + continue; + + assert(groups[other].size > 0); + if (groups[id].size + groups[other].size > max_partition_size) + continue; + + 
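`heapPush` and `heapPop` above maintain a plain array-backed binary min-heap keyed on `GroupOrder::order`, so the group with the fewest live vertices is always processed first. As a quick illustration of the invariant they preserve (every parent is no larger than its children, so repeated pops come out in ascending order), here is a minimal standalone sketch of the same sift-up/sift-down logic; the `push`/`pop` names and the test harness are hypothetical, not part of the library:

```cpp
#include <assert.h>
#include <stddef.h>

struct Item
{
	unsigned int id;
	int order;
};

// sift-up insertion, mirroring the heapPush logic above
static void push(Item* heap, size_t& size, Item item)
{
	heap[size++] = item;

	for (size_t i = size - 1; i > 0 && heap[i].order < heap[(i - 1) / 2].order; i = (i - 1) / 2)
	{
		Item tmp = heap[i];
		heap[i] = heap[(i - 1) / 2];
		heap[(i - 1) / 2] = tmp;
	}
}

// sift-down removal of the minimum, mirroring the heapPop logic above
static Item pop(Item* heap, size_t& size)
{
	Item top = heap[0];
	heap[0] = heap[--size];

	for (size_t i = 0; i * 2 + 1 < size;)
	{
		// pick the smaller of the two children
		size_t j = i * 2 + 1;
		j += (j + 1 < size && heap[j + 1].order < heap[j].order);

		if (heap[j].order >= heap[i].order)
			break;

		Item tmp = heap[i];
		heap[i] = heap[j];
		heap[j] = tmp;
		i = j;
	}

	return top;
}

int main()
{
	Item heap[8];
	size_t size = 0;
	int values[] = {5, 1, 4, 2, 3};

	for (size_t i = 0; i < 5; ++i)
		push(heap, size, Item{unsigned(i), values[i]});

	// pops yield orders in ascending sequence: 1, 2, 3, 4, 5
	for (int expected = 1; expected <= 5; ++expected)
		assert(pop(heap, size).order == expected);

	return 0;
}
```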
unsigned int shared = countShared(groups, id, other, adjacency); + float other_rsqrt = 1.f / sqrtf(float(int(groups[other].vertices))); + + // normalize shared count by the expected boundary of each group (+ keeps scoring symmetric) + float score = float(int(shared)) * (group_rsqrt + other_rsqrt); + + if (score > best_score) + { + best_group = other; + best_score = score; + } + } + } + + return best_group; +} + +} // namespace meshopt + +size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, size_t vertex_count, size_t target_partition_size) +{ + using namespace meshopt; + + assert(target_partition_size > 0); + + size_t max_partition_size = target_partition_size + target_partition_size * 3 / 8; + + meshopt_Allocator allocator; + + unsigned char* used = allocator.allocate(vertex_count); + memset(used, 0, vertex_count); + + // build cluster index offsets as a prefix sum + unsigned int* cluster_offsets = allocator.allocate(cluster_count + 1); + unsigned int cluster_nextoffset = 0; + + for (size_t i = 0; i < cluster_count; ++i) + { + assert(cluster_index_counts[i] > 0); + + cluster_offsets[i] = cluster_nextoffset; + cluster_nextoffset += cluster_index_counts[i]; + } + + assert(cluster_nextoffset == total_index_count); + cluster_offsets[cluster_count] = unsigned(total_index_count); + + // build cluster adjacency along with edge weights (shared vertex count) + ClusterAdjacency adjacency = {}; + buildClusterAdjacency(adjacency, cluster_indices, cluster_offsets, cluster_count, used, vertex_count, allocator); + + ClusterGroup* groups = allocator.allocate(cluster_count); + + GroupOrder* order = allocator.allocate(cluster_count); + size_t pending = 0; + + // create a singleton group for each cluster and order them by priority + for (size_t i = 0; i < cluster_count; ++i) + { + groups[i].group = int(i); + groups[i].next = -1; + groups[i].size = 1; + groups[i].vertices = countTotal(groups, int(i), cluster_indices, cluster_offsets, used); + + GroupOrder item = {}; + item.id = unsigned(i); + item.order = groups[i].vertices; + + heapPush(order, pending++, item); + } + + // iteratively merge the smallest group with the best group + while (pending) + { + GroupOrder top = heapPop(order, pending--); + + // this group was merged into another group earlier + if (groups[top.id].size == 0) + continue; + + // disassociate clusters from the group to prevent them from being merged again; we will re-associate them if the group is reinserted + for (int i = top.id; i >= 0; i = groups[i].next) + { + assert(groups[i].group == int(top.id)); + groups[i].group = -1; + } + + // the group is large enough, emit as is + if (groups[top.id].size >= target_partition_size) + continue; + + int best_group = pickGroupToMerge(groups, top.id, adjacency, max_partition_size); + + // we can't grow the group any more, emit as is + if (best_group == -1) + continue; + + // compute shared vertices to adjust the total vertices estimate after merging + unsigned int shared = countShared(groups, top.id, best_group, adjacency); + + // combine groups by linking them together + assert(groups[best_group].size > 0); + + for (int i = top.id; i >= 0; i = groups[i].next) + if (groups[i].next < 0) + { + groups[i].next = best_group; + break; + } + + // update group sizes; note, the vertex update is an approximation which avoids recomputing the true size via countTotal + groups[top.id].size += groups[best_group].size; + 
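The vertex bookkeeping just below is the approximation that the comment above refers to: instead of recounting unique vertices with `countTotal` after every merge, the merged group's vertex count is estimated by inclusion-exclusion as `vertices(A) + vertices(B) - shared(A, B)`, clamped to at least 1 because the shared count can overestimate the true overlap. A tiny standalone sketch of that arithmetic (the function name and harness are hypothetical):

```cpp
#include <assert.h>

// union-size estimate used conceptually by the merge step below:
// |A ∪ B| ~= |A| + |B| - |A ∩ B|, clamped to at least 1
static unsigned int mergedVertexEstimate(unsigned int va, unsigned int vb, unsigned int shared)
{
	unsigned int sum = va + vb;
	return sum > shared ? sum - shared : 1;
}

int main()
{
	// two clusters of 64 vertices that share a 16-vertex boundary
	assert(mergedVertexEstimate(64, 64, 16) == 112);

	// degenerate case: the shared estimate exceeds the sum, so clamp to 1
	assert(mergedVertexEstimate(2, 2, 8) == 1);

	return 0;
}
```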
groups[top.id].vertices += groups[best_group].vertices;
+		groups[top.id].vertices = (groups[top.id].vertices > shared) ? groups[top.id].vertices - shared : 1;
+
+		groups[best_group].size = 0;
+		groups[best_group].vertices = 0;
+
+		// re-associate all clusters back to the merged group
+		for (int i = top.id; i >= 0; i = groups[i].next)
+			groups[i].group = int(top.id);
+
+		top.order = groups[top.id].vertices;
+		heapPush(order, pending++, top);
+	}
+
+	size_t next_group = 0;
+
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		if (groups[i].size == 0)
+			continue;
+
+		for (int j = int(i); j >= 0; j = groups[j].next)
+			destination[j] = unsigned(next_group);
+
+		next_group++;
+	}
+
+	assert(next_group <= cluster_count);
+	return next_group;
+}
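Since `meshopt_partitionClusters` is a new public entry point, a short usage sketch may help review; the toy cluster data below is hypothetical, and a real caller would pass the flattened vertex index lists of its meshlets or LOD clusters:

```cpp
#include <assert.h>

#include <vector>

#include "meshoptimizer.h"

int main()
{
	// three tiny clusters over a 5-vertex mesh, flattened back to back
	const unsigned int cluster_indices[] = {
	    0, 1, 2, // cluster 0
	    1, 2, 3, // cluster 1, shares vertices with cluster 0
	    3, 4, 0, // cluster 2
	};
	const unsigned int cluster_index_counts[] = {3, 3, 3};
	const size_t cluster_count = 3;
	const size_t vertex_count = 5;

	std::vector<unsigned int> partition(cluster_count);

	// groups clusters that share vertices, aiming for ~2 clusters per partition;
	// returns the partition count, and partition[i] receives the partition id of cluster i
	size_t partition_count = meshopt_partitionClusters(partition.data(),
	    cluster_indices, sizeof(cluster_indices) / sizeof(cluster_indices[0]),
	    cluster_index_counts, cluster_count, vertex_count, /* target_partition_size */ 2);

	assert(partition_count >= 1 && partition_count <= cluster_count);
	return 0;
}
```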
quadricError(attribute_quadrics[j0], &attribute_gradients[j0 * attribute_count], attribute_count, vertex_positions[j1], &vertex_attributes[j1 * attribute_count]) : 0;
+
+			// note: seam edges need to aggregate attribute errors between primary and secondary edges, as attribute quadrics are separate
+			if (vertex_kind[i0] == Kind_Seam)
+			{
+				// for seam collapses we need to find the seam pair; this is a bit tricky since we need to rely on edge loops as the target vertex may be locked (and thus have more than two wedges)
+				unsigned int s0 = wedge[i0];
+				unsigned int s1 = loop[i0] == i1 ? loopback[s0] : loop[s0];
+
+				assert(s0 != i0 && wedge[s0] == i0);
+				assert(s1 != ~0u && remap[s1] == remap[i1]);
+
+				// note: the assertion above should make this impossible, but if assertions are disabled and we ever hit this case we would get a memory safety issue; for now play it safe
+				s1 = (s1 != ~0u) ? s1 : wedge[i1];
+
+				ei += quadricError(attribute_quadrics[s0], &attribute_gradients[s0 * attribute_count], attribute_count, vertex_positions[s1], &vertex_attributes[s1 * attribute_count]);
+				ej += c.bidi ? quadricError(attribute_quadrics[s1], &attribute_gradients[s1 * attribute_count], attribute_count, vertex_positions[s0], &vertex_attributes[s0 * attribute_count]) : 0;
+			}
 		}
 
 		// pick edge direction with minimal error
@@ -1206,7 +1227,7 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
 		}
 		else if (kind == Kind_Seam)
 		{
-			// for seam collapses we need to move the seam pair together; this is a bit tricky to compute since we need to rely on edge loops as target vertex may be locked (and thus have more than two wedges)
+			// for seam collapses we need to move the seam pair together; this is a bit tricky since we need to rely on edge loops as the target vertex may be locked (and thus have more than two wedges)
 			unsigned int s0 = wedge[i0];
 			unsigned int s1 = loop[i0] == i1 ? loopback[s0] : loop[s0];
 			assert(s0 != i0 && wedge[s0] == i0);
@@ -1964,7 +1985,7 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
 	printf("pass %d:%c", int(pass_count++), TRACE >= 2 ? 
'\n' : ' ');
 #endif
 
-	rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_attributes, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, remap);
+	rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_attributes, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, vertex_kind, loop, loopback);
 
 	sortEdgeCollapses(collapse_order, edge_collapses, edge_collapse_count);
diff --git a/thirdparty/meshoptimizer/vertexcodec.cpp b/thirdparty/meshoptimizer/vertexcodec.cpp
index 1dbd2e35f80..53cf9d753c0 100644
--- a/thirdparty/meshoptimizer/vertexcodec.cpp
+++ b/thirdparty/meshoptimizer/vertexcodec.cpp
@@ -60,6 +60,15 @@
 #define SIMD_LATENCYOPT
 #endif
 
+// In switch dispatch, marking the default case as unreachable allows the compiler to remove redundant bounds checks
+#if defined(__GNUC__)
+#define SIMD_UNREACHABLE() __builtin_unreachable()
+#elif defined(_MSC_VER)
+#define SIMD_UNREACHABLE() __assume(false)
+#else
+#define SIMD_UNREACHABLE() assert(!"Unreachable")
+#endif
+
 #endif // !MESHOPTIMIZER_NO_SIMD
 
 #ifdef SIMD_SSE
@@ -114,33 +123,44 @@ namespace meshopt
 const unsigned char kVertexHeader = 0xa0;
 
 static int gEncodeVertexVersion = 0;
+const int kDecodeVertexVersion = 1;
 
 const size_t kVertexBlockSizeBytes = 8192;
 const size_t kVertexBlockMaxSize = 256;
 
 const size_t kByteGroupSize = 16;
 const size_t kByteGroupDecodeLimit = 24;
-const size_t kTailMaxSize = 32;
+const size_t kTailMinSizeV0 = 32;
+const size_t kTailMinSizeV1 = 24;
+
+static const int kBitsV0[4] = {0, 2, 4, 8};
+static const int kBitsV1[5] = {0, 1, 2, 4, 8};
+
+const int kEncodeDefaultLevel = 2;
 
 static size_t getVertexBlockSize(size_t vertex_size)
 {
-	// make sure the entire block fits into the scratch buffer
-	size_t result = kVertexBlockSizeBytes / vertex_size;
-
-	// align to byte group size; we encode each byte as a byte group
-	// if vertex block is misaligned, it results in wasted bytes, so just truncate the block size
-	result &= ~(kByteGroupSize - 1);
+	// make sure the entire block fits into the scratch buffer and is aligned to byte group size
+	// note: the block size is implicitly part of the format, so we can't change it without breaking compatibility
+	size_t result = (kVertexBlockSizeBytes / vertex_size) & ~(kByteGroupSize - 1);
 
 	return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
 }
 
-inline unsigned char zigzag8(unsigned char v)
+inline unsigned int rotate(unsigned int v, int r)
 {
-	return ((signed char)(v) >> 7) ^ (v << 1);
+	return (v << r) | (v >> ((32 - r) & 31));
 }
 
-inline unsigned char unzigzag8(unsigned char v)
+template <typename T>
+inline T zigzag(T v)
 {
-	return -(v & 1) ^ (v >> 1);
+	return (0 - (v >> (sizeof(T) * 8 - 1))) ^ (v << 1);
+}
+
+template <typename T>
+inline T unzigzag(T v)
+{
+	return (0 - (v & 1)) ^ (v >> 1);
 }
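The templated `zigzag`/`unzigzag` pair above generalizes the old byte-wide `zigzag8`/`unzigzag8`: signed deltas are folded so that values near zero in either direction map to small unsigned codes, which is what lets most byte groups get away with very few bits per value. A minimal standalone sketch of the same mapping (the test harness is hypothetical, not part of the library):

```cpp
#include <assert.h>

// same bit trick as the zigzag/unzigzag templates above:
// 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ...
template <typename T>
inline T zigzag(T v)
{
	return (0 - (v >> (sizeof(T) * 8 - 1))) ^ (v << 1);
}

template <typename T>
inline T unzigzag(T v)
{
	return (0 - (v & 1)) ^ (v >> 1);
}

int main()
{
	// a delta of -1 stored as a raw byte would be 0xff (8 significant bits);
	// zigzag folds it to 1, which fits in a single bit
	assert(zigzag<unsigned char>((unsigned char)-1) == 1);
	assert(zigzag<unsigned char>(1) == 2);

	// the mapping round-trips over all byte values
	for (int i = 0; i < 256; ++i)
		assert(unzigzag<unsigned char>(zigzag<unsigned char>((unsigned char)i)) == (unsigned char)i);

	// 16-bit deltas fold the same way, keeping both delta bytes small
	assert(zigzag<unsigned short>((unsigned short)-2) == 3);

	return 0;
}
```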
 
 #if TRACE
@@ -148,8 +168,9 @@ struct Stats
 {
 	size_t size;
 	size_t header;  // bytes for header
-	size_t bitg[4]; // bytes for bit groups
+	size_t bitg[9]; // bytes for bit groups
 	size_t bitc[8]; // bit consistency: how many bits are shared between all bytes in a group
+	size_t ctrl[4]; // number of control groups
 };
 
 static Stats* bytestats = NULL;
@@ -158,18 +179,19 @@ static Stats vertexstats[256];
 
 static bool encodeBytesGroupZero(const unsigned char* buffer)
 {
-	for (size_t i = 0; i < kByteGroupSize; ++i)
-		if (buffer[i])
-			return false;
+	assert(kByteGroupSize == sizeof(unsigned long long) * 2);
 
-	return true;
+	unsigned long long v[2];
+	memcpy(v, buffer, sizeof(v));
+
+	return (v[0] | v[1]) == 0;
 }
 
 static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
 {
-	assert(bits >= 1 && bits <= 8);
+	assert(bits >= 0 && bits <= 8);
 
-	if (bits == 1)
+	if (bits == 0)
 		return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);
 
 	if (bits == 8)
@@ -187,9 +209,10 @@ static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
 
 static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits)
 {
-	assert(bits >= 1 && bits <= 8);
+	assert(bits >= 0 && bits <= 8);
+	assert(kByteGroupSize % 8 == 0);
 
-	if (bits == 1)
+	if (bits == 0)
 		return data;
 
 	if (bits == 8)
@@ -217,21 +240,27 @@ static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char*
 			byte |= enc;
 		}
 
+		// encode 1-bit groups in reverse bit order
+		// this makes them faster to decode alongside other groups
+		if (bits == 1)
+			byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);
+
 		*data++ = byte;
 	}
 
 	for (size_t i = 0; i < kByteGroupSize; ++i)
 	{
-		if (buffer[i] >= sentinel)
-		{
-			*data++ = buffer[i];
-		}
+		unsigned char v = buffer[i];
+
+		// branchless append of out-of-range values
+		*data = v;
+		data += v >= sentinel;
 	}
 
 	return data;
 }
 
-static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size)
+static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size, const int bits[4])
 {
 	assert(buffer_size % kByteGroupSize == 0);
 
@@ -247,39 +276,40 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end,
 	memset(header, 0, header_size);
 
+	int last_bits = -1;
+
 	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
 	{
 		if (size_t(data_end - data) < kByteGroupDecodeLimit)
 			return NULL;
 
-		int best_bits = 8;
-		size_t best_size = encodeBytesGroupMeasure(buffer + i, 8);
+		int best_bitk = 3;
+		size_t best_size = encodeBytesGroupMeasure(buffer + i, bits[best_bitk]);
 
-		for (int bits = 1; bits < 8; bits *= 2)
+		for (int bitk = 0; bitk < 3; ++bitk)
 		{
-			size_t size = encodeBytesGroupMeasure(buffer + i, bits);
+			size_t size = encodeBytesGroupMeasure(buffer + i, bits[bitk]);
 
-			if (size < best_size)
+			// favor consistent bit selection across groups, but never replace literals
+			if (size < best_size || (size == best_size && bits[bitk] == 
last_bits && bits[best_bitk] != 8)) { - best_bits = bits; + best_bitk = bitk; best_size = size; } } - int bitslog2 = (best_bits == 1) ? 0 : (best_bits == 2 ? 1 : (best_bits == 4 ? 2 : 3)); - assert((1 << bitslog2) == best_bits); - size_t header_offset = i / kByteGroupSize; + header[header_offset / 4] |= best_bitk << ((header_offset % 4) * 2); - header[header_offset / 4] |= bitslog2 << ((header_offset % 4) * 2); - + int best_bits = bits[best_bitk]; unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits); assert(data + best_size == next); data = next; + last_bits = best_bits; #if TRACE - bytestats->bitg[bitslog2] += best_size; + bytestats->bitg[best_bits] += best_size; #endif } @@ -290,51 +320,252 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, return data; } -static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256]) +template +static void encodeDeltas1(unsigned char* buffer, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[256], size_t k, int rot) +{ + size_t k0 = k & ~(sizeof(T) - 1); + int ks = (k & (sizeof(T) - 1)) * 8; + + T p = last_vertex[k0]; + for (size_t j = 1; j < sizeof(T); ++j) + p |= T(last_vertex[k0 + j]) << (j * 8); + + const unsigned char* vertex = vertex_data + k0; + + for (size_t i = 0; i < vertex_count; ++i) + { + T v = vertex[0]; + for (size_t j = 1; j < sizeof(T); ++j) + v |= vertex[j] << (j * 8); + + T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p)); + + buffer[i] = (unsigned char)(d >> ks); + p = v; + vertex += vertex_size; + } +} + +static void encodeDeltas(unsigned char* buffer, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[256], size_t k, int channel) +{ + switch (channel & 3) + { + case 0: + return encodeDeltas1(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0); + case 1: + return encodeDeltas1(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0); + case 2: + return encodeDeltas1(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, channel >> 4); + default: + assert(!"Unsupported channel encoding"); // unreachable + } +} + +static int estimateBits(unsigned char v) +{ + return v <= 15 ? (v <= 3 ? (v == 0 ? 0 : 2) : 4) : 8; +} + +static int estimateRotate(const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, size_t k, size_t group_size) +{ + size_t sizes[8] = {}; + + const unsigned char* vertex = vertex_data + k; + unsigned int last = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24); + + for (size_t i = 0; i < vertex_count; i += group_size) + { + unsigned int bitg = 0; + + // calculate bit consistency mask for the group + for (size_t j = 0; j < group_size && i + j < vertex_count; ++j) + { + unsigned int v = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24); + unsigned int d = v ^ last; + + bitg |= d; + last = v; + vertex += vertex_size; + } + +#if TRACE + for (int j = 0; j < 32; ++j) + vertexstats[k + (j / 8)].bitc[j % 8] += (i + group_size < vertex_count ? 
group_size : vertex_count - i) * (1 - ((bitg >> j) & 1)); +#endif + + for (int j = 0; j < 8; ++j) + { + unsigned int bitr = rotate(bitg, j); + + sizes[j] += estimateBits((unsigned char)(bitr >> 0)) + estimateBits((unsigned char)(bitr >> 8)); + sizes[j] += estimateBits((unsigned char)(bitr >> 16)) + estimateBits((unsigned char)(bitr >> 24)); + } + } + + int best_rot = 0; + for (int rot = 1; rot < 8; ++rot) + best_rot = (sizes[rot] < sizes[best_rot]) ? rot : best_rot; + + return best_rot; +} + +static int estimateChannel(const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, size_t k, size_t vertex_block_size, size_t block_skip, int max_channel, int xor_rot) +{ + unsigned char block[kVertexBlockMaxSize]; + assert(vertex_block_size <= kVertexBlockMaxSize); + + unsigned char last_vertex[256] = {}; + + size_t sizes[3] = {}; + assert(max_channel <= 3); + + for (size_t i = 0; i < vertex_count; i += vertex_block_size * block_skip) + { + size_t block_size = i + vertex_block_size < vertex_count ? vertex_block_size : vertex_count - i; + size_t block_size_aligned = (block_size + kByteGroupSize - 1) & ~(kByteGroupSize - 1); + + memcpy(last_vertex, vertex_data + (i == 0 ? 0 : i - 1) * vertex_size, vertex_size); + + // we sometimes encode elements we didn't fill when rounding to kByteGroupSize + if (block_size < block_size_aligned) + memset(block + block_size, 0, block_size_aligned - block_size); + + for (int channel = 0; channel < max_channel; ++channel) + for (size_t j = 0; j < 4; ++j) + { + encodeDeltas(block, vertex_data + i * vertex_size, block_size, vertex_size, last_vertex, k + j, channel | (xor_rot << 4)); + + for (size_t ig = 0; ig < block_size; ig += kByteGroupSize) + { + // to maximize encoding performance we only evaluate 1/2/4/8 bit groups + size_t size1 = encodeBytesGroupMeasure(block + ig, 1); + size_t size2 = encodeBytesGroupMeasure(block + ig, 2); + size_t size4 = encodeBytesGroupMeasure(block + ig, 4); + size_t size8 = encodeBytesGroupMeasure(block + ig, 8); + + size_t best_size = size1 < size2 ? size1 : size2; + best_size = best_size < size4 ? best_size : size4; + best_size = best_size < size8 ? best_size : size8; + + sizes[channel] += best_size; + } + } + } + + int best_channel = 0; + for (int channel = 1; channel < max_channel; ++channel) + best_channel = (sizes[channel] < sizes[best_channel]) ? channel : best_channel; + + return best_channel == 2 ? 
best_channel | (xor_rot << 4) : best_channel; +} + +static bool estimateControlZero(const unsigned char* buffer, size_t vertex_count_aligned) +{ + for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize) + if (!encodeBytesGroupZero(buffer + i)) + return false; + + return true; +} + +static int estimateControl(const unsigned char* buffer, size_t vertex_count, size_t vertex_count_aligned, int level) +{ + if (estimateControlZero(buffer, vertex_count_aligned)) + return 2; // zero encoding + + if (level == 0) + return 1; // 1248 encoding in level 0 for encoding speed + + // round number of groups to 4 to get number of header bytes + size_t header_size = (vertex_count_aligned / kByteGroupSize + 3) / 4; + + size_t est_bytes0 = header_size, est_bytes1 = header_size; + + for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize) + { + // assumes kBitsV1[] = {0, 1, 2, 4, 8} for performance + size_t size0 = encodeBytesGroupMeasure(buffer + i, 0); + size_t size1 = encodeBytesGroupMeasure(buffer + i, 1); + size_t size2 = encodeBytesGroupMeasure(buffer + i, 2); + size_t size4 = encodeBytesGroupMeasure(buffer + i, 4); + size_t size8 = encodeBytesGroupMeasure(buffer + i, 8); + + // both control modes have access to 1/2/4 bit encoding + size_t size12 = size1 < size2 ? size1 : size2; + size_t size124 = size12 < size4 ? size12 : size4; + + // each control mode has access to 0/8 bit encoding respectively + est_bytes0 += size124 < size0 ? size124 : size0; + est_bytes1 += size124 < size8 ? size124 : size8; + } + + // pick shortest control entry but prefer literal encoding + if (est_bytes0 < vertex_count || est_bytes1 < vertex_count) + return est_bytes0 < est_bytes1 ? 0 : 1; + else + return 3; // literal encoding +} + +static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version, int level) { assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize); + assert(vertex_size % 4 == 0); unsigned char buffer[kVertexBlockMaxSize]; assert(sizeof(buffer) % kByteGroupSize == 0); + size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1); + // we sometimes encode elements we didn't fill when rounding to kByteGroupSize memset(buffer, 0, sizeof(buffer)); + size_t control_size = version == 0 ? 0 : vertex_size / 4; + if (size_t(data_end - data) < control_size) + return NULL; + + unsigned char* control = data; + data += control_size; + + memset(control, 0, control_size); + for (size_t k = 0; k < vertex_size; ++k) { - size_t vertex_offset = k; - - unsigned char p = last_vertex[k]; - - for (size_t i = 0; i < vertex_count; ++i) - { - buffer[i] = zigzag8(vertex_data[vertex_offset] - p); - - p = vertex_data[vertex_offset]; - - vertex_offset += vertex_size; - } + encodeDeltas(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, version == 0 ? 0 : channels[k / 4]); #if TRACE const unsigned char* olddata = data; bytestats = &vertexstats[k]; - - for (size_t ig = 0; ig < vertex_count; ig += kByteGroupSize) - { - unsigned char last = (ig == 0) ? last_vertex[k] : vertex_data[vertex_size * (ig - 1) + k]; - unsigned char delta = 0xff; - - for (size_t i = ig; i < ig + kByteGroupSize && i < vertex_count; ++i) - delta &= ~(vertex_data[vertex_size * i + k] ^ last); - - for (int j = 0; j < 8; ++j) - bytestats->bitc[j] += (vertex_count - ig < kByteGroupSize ? 
vertex_count - ig : kByteGroupSize) * ((delta >> j) & 1); - } #endif - data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1)); - if (!data) - return NULL; + int ctrl = 0; + + if (version != 0) + { + ctrl = estimateControl(buffer, vertex_count, vertex_count_aligned, level); + + assert(unsigned(ctrl) < 4); + control[k / 4] |= ctrl << ((k % 4) * 2); + +#if TRACE + vertexstats[k].ctrl[ctrl]++; +#endif + } + + if (ctrl == 3) + { + // literal encoding + if (size_t(data_end - data) < vertex_count) + return NULL; + + memcpy(data, buffer, vertex_count); + data += vertex_count; + } + else if (ctrl != 2) // non-zero encoding + { + data = encodeBytes(data, data_end, buffer, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl); + if (!data) + return NULL; + } #if TRACE bytestats = NULL; @@ -348,7 +579,7 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data } #if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX) && !defined(SIMD_WASM)) -static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2) +static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bits) { #define READ() byte = *data++ #define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? encv : enc, data_var += (enc == (1 << bits) - 1) @@ -356,12 +587,24 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned unsigned char byte, enc, encv; const unsigned char* data_var; - switch (bitslog2) + switch (bits) { case 0: memset(buffer, 0, kByteGroupSize); return data; case 1: + data_var = data + 2; + + // 2 groups with 8 1-bit values in each byte (reversed from the order in other groups) + READ(); + byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32); + NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1); + READ(); + byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32); + NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1); + + return data_var; + case 2: data_var = data + 4; // 4 groups with 4 2-bit values in each byte @@ -371,7 +614,7 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2); return data_var; - case 2: + case 4: data_var = data + 8; // 8 groups with 2 4-bit values in each byte @@ -385,11 +628,11 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned READ(), NEXT(4), NEXT(4); return data_var; - case 3: + case 8: memcpy(buffer, data, kByteGroupSize); return data + kByteGroupSize; default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value + assert(!"Unexpected bit length"); // unreachable return data; } @@ -397,18 +640,16 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned #undef NEXT } -static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size) +static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size, const int* bits) { assert(buffer_size % kByteGroupSize == 0); - const unsigned char* header = data; - // round number of groups to 4 to get number of header bytes size_t header_size = (buffer_size / 
kByteGroupSize + 3) / 4; - if (size_t(data_end - data) < header_size) return NULL; + const unsigned char* header = data; data += header_size; for (size_t i = 0; i < buffer_size; i += kByteGroupSize) @@ -417,43 +658,108 @@ static const unsigned char* decodeBytes(const unsigned char* data, const unsigne return NULL; size_t header_offset = i / kByteGroupSize; + int bitsk = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3; - int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3; - - data = decodeBytesGroup(data, buffer + i, bitslog2); + data = decodeBytesGroup(data, buffer + i, bits[bitsk]); } return data; } -static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256]) +template +static void decodeDeltas1(const unsigned char* buffer, unsigned char* transposed, size_t vertex_count, size_t vertex_size, const unsigned char* last_vertex, int rot) +{ + for (size_t k = 0; k < 4; k += sizeof(T)) + { + size_t vertex_offset = k; + + T p = last_vertex[0]; + for (size_t j = 1; j < sizeof(T); ++j) + p |= last_vertex[j] << (8 * j); + + for (size_t i = 0; i < vertex_count; ++i) + { + T v = buffer[i]; + for (size_t j = 1; j < sizeof(T); ++j) + v |= buffer[i + vertex_count * j] << (8 * j); + + v = Xor ? T(rotate(v, rot)) ^ p : unzigzag(v) + p; + + for (size_t j = 0; j < sizeof(T); ++j) + transposed[vertex_offset + j] = (unsigned char)(v >> (j * 8)); + + p = v; + + vertex_offset += vertex_size; + } + + buffer += vertex_count * sizeof(T); + last_vertex += sizeof(T); + } +} + +static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version) { assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize); - unsigned char buffer[kVertexBlockMaxSize]; + unsigned char buffer[kVertexBlockMaxSize * 4]; unsigned char transposed[kVertexBlockSizeBytes]; size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1); assert(vertex_count <= vertex_count_aligned); - for (size_t k = 0; k < vertex_size; ++k) + size_t control_size = version == 0 ? 0 : vertex_size / 4; + if (size_t(data_end - data) < control_size) + return NULL; + + const unsigned char* control = data; + data += control_size; + + for (size_t k = 0; k < vertex_size; k += 4) { - data = decodeBytes(data, data_end, buffer, vertex_count_aligned); - if (!data) - return NULL; + unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4]; - size_t vertex_offset = k; - - unsigned char p = last_vertex[k]; - - for (size_t i = 0; i < vertex_count; ++i) + for (size_t j = 0; j < 4; ++j) { - unsigned char v = unzigzag8(buffer[i]) + p; + int ctrl = (ctrl_byte >> (j * 2)) & 3; - transposed[vertex_offset] = v; - p = v; + if (ctrl == 3) + { + // literal encoding + if (size_t(data_end - data) < vertex_count) + return NULL; - vertex_offset += vertex_size; + memcpy(buffer + j * vertex_count, data, vertex_count); + data += vertex_count; + } + else if (ctrl == 2) + { + // zero encoding + memset(buffer + j * vertex_count, 0, vertex_count); + } + else + { + data = decodeBytes(data, data_end, buffer + j * vertex_count, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl); + if (!data) + return NULL; + } + } + + int channel = version == 0 ? 
0 : channels[k / 4]; + + switch (channel & 3) + { + case 0: + decodeDeltas1(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, 0); + break; + case 1: + decodeDeltas1(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, 0); + break; + case 2: + decodeDeltas1(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31); + break; + default: + return NULL; // invalid channel type } } @@ -499,7 +805,7 @@ static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables(); #ifdef SIMD_SSE SIMD_TARGET -static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1) +inline __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1) { __m128i sm0 = _mm_loadl_epi64(reinterpret_cast(&kDecodeBytesGroupShuffle[mask0])); __m128i sm1 = _mm_loadl_epi64(reinterpret_cast(&kDecodeBytesGroupShuffle[mask1])); @@ -511,11 +817,12 @@ static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1) } SIMD_TARGET -static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits) { - switch (bitslog2) + switch (hbits) { case 0: + case 4: { __m128i result = _mm_setzero_si128(); @@ -525,6 +832,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 1: + case 6: { #ifdef __GNUC__ typedef int __attribute__((aligned(1))) unaligned_int; @@ -557,7 +865,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi unsigned char mask1 = (unsigned char)(mask16 >> 8); __m128i shuf = decodeShuffleMask(mask0, mask1); - __m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel)); _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); @@ -570,6 +877,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 2: + case 7: { #ifdef SIMD_LATENCYOPT unsigned long long data64; @@ -593,7 +901,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi unsigned char mask1 = (unsigned char)(mask16 >> 8); __m128i shuf = decodeShuffleMask(mask0, mask1); - __m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel)); _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); @@ -606,6 +913,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 3: + case 8: { __m128i result = _mm_loadu_si128(reinterpret_cast(data)); @@ -614,26 +922,46 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi return data + 16; } + case 5: + { + __m128i rest = _mm_loadu_si128(reinterpret_cast(data + 2)); + + unsigned char mask0 = data[0]; + unsigned char mask1 = data[1]; + + __m128i shuf = decodeShuffleMask(mask0, mask1); + __m128i result = _mm_shuffle_epi8(rest, shuf); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); + + return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; + } + default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value - return data; + SIMD_UNREACHABLE(); // unreachable } } #endif #ifdef SIMD_AVX -static const __m128i decodeBytesGroupConfig[] = { - _mm_set1_epi8(3), - _mm_set1_epi8(15), - _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24), - _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56), +static const __m128i 
kDecodeBytesGroupConfig[8][2] = { + {_mm_setzero_si128(), _mm_setzero_si128()}, + {_mm_set1_epi8(3), _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24)}, + {_mm_set1_epi8(15), _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56)}, + {_mm_setzero_si128(), _mm_setzero_si128()}, + {_mm_setzero_si128(), _mm_setzero_si128()}, + {_mm_set1_epi8(1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)}, + {_mm_set1_epi8(3), _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24)}, + {_mm_set1_epi8(15), _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56)}, }; -static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +SIMD_TARGET +inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits) { - switch (bitslog2) + switch (hbits) { case 0: + case 4: { __m128i result = _mm_setzero_si128(); @@ -642,16 +970,19 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi return data; } - case 1: - case 2: + case 5: // 1-bit + case 1: // 2-bit + case 6: + case 2: // 4-bit + case 7: { - const unsigned char* skip = data + (bitslog2 << 2); + const unsigned char* skip = data + (2 << (hbits < 3 ? hbits : hbits - 5)); __m128i selb = _mm_loadl_epi64(reinterpret_cast(data)); __m128i rest = _mm_loadu_si128(reinterpret_cast(skip)); - __m128i sent = decodeBytesGroupConfig[bitslog2 - 1]; - __m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1]; + __m128i sent = kDecodeBytesGroupConfig[hbits][0]; + __m128i ctrl = kDecodeBytesGroupConfig[hbits][1]; __m128i selw = _mm_shuffle_epi32(selb, 0x44); __m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw)); @@ -665,6 +996,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 3: + case 8: { __m128i result = _mm_loadu_si128(reinterpret_cast(data)); @@ -674,14 +1006,14 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value - return data; + SIMD_UNREACHABLE(); // unreachable } } #endif #ifdef SIMD_NEON -static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1) +SIMD_TARGET +inline uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1) { uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]); uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]); @@ -692,7 +1024,8 @@ static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8 return vcombine_u8(r0, r1); } -static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1) +SIMD_TARGET +inline void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1) { // magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00 const uint64_t magic = 0x000103070f1f3f80ull; @@ -703,11 +1036,13 @@ static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& m mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56); } -static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +SIMD_TARGET +inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits) { - switch (bitslog2) + switch (hbits) { case 0: + case 4: { uint8x16_t result = vdupq_n_u8(0); @@ -717,6 
+1052,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 1: + case 6: { #ifdef SIMD_LATENCYOPT unsigned int data32; @@ -754,6 +1090,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 2: + case 7: { #ifdef SIMD_LATENCYOPT unsigned long long data64; @@ -788,6 +1125,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 3: + case 8: { uint8x16_t result = vld1q_u8(data); @@ -796,30 +1134,42 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi return data + 16; } + case 5: + { + unsigned char mask0 = data[0]; + unsigned char mask1 = data[1]; + + uint8x8_t rest0 = vld1_u8(data + 2); + uint8x8_t rest1 = vld1_u8(data + 2 + kDecodeBytesGroupCount[mask0]); + + uint8x16_t result = shuffleBytes(mask0, mask1, rest0, rest1); + + vst1q_u8(buffer, result); + + return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; + } + default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value - return data; + SIMD_UNREACHABLE(); // unreachable } } #endif #ifdef SIMD_WASM SIMD_TARGET -static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1) +inline v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1) { v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]); v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]); - v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]); - sm1off = wasm_i8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - + v128_t sm1off = wasm_v128_load8_splat(&kDecodeBytesGroupCount[mask0]); v128_t sm1r = wasm_i8x16_add(sm1, sm1off); return wasmx_unpacklo_v64x2(sm0, sm1r); } SIMD_TARGET -static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1) +inline void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1) { // magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00 const uint64_t magic = 0x000103070f1f3f80ull; @@ -829,11 +1179,12 @@ static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1 } SIMD_TARGET -static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits) { - switch (bitslog2) + switch (hbits) { case 0: + case 4: { v128_t result = wasm_i8x16_splat(0); @@ -843,6 +1194,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 1: + case 6: { v128_t sel2 = wasm_v128_load(data); v128_t rest = wasm_v128_load(data + 4); @@ -857,7 +1209,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi wasmMoveMask(mask, mask0, mask1); v128_t shuf = decodeShuffleMask(mask0, mask1); - v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask); wasm_v128_store(buffer, result); @@ -866,6 +1217,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 2: + case 7: { v128_t sel4 = wasm_v128_load(data); v128_t rest = wasm_v128_load(data + 8); @@ -879,7 +1231,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi wasmMoveMask(mask, mask0, mask1); v128_t shuf = decodeShuffleMask(mask0, mask1); - v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask); wasm_v128_store(buffer, result); @@ -888,6 +1239,7 @@ static const 
unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 3: + case 8: { v128_t result = wasm_v128_load(data); @@ -896,16 +1248,30 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi return data + 16; } + case 5: + { + v128_t rest = wasm_v128_load(data + 2); + + unsigned char mask0 = data[0]; + unsigned char mask1 = data[1]; + + v128_t shuf = decodeShuffleMask(mask0, mask1); + v128_t result = wasm_i8x16_swizzle(rest, shuf); + + wasm_v128_store(buffer, result); + + return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; + } + default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value - return data; + SIMD_UNREACHABLE(); // unreachable } } #endif #if defined(SIMD_SSE) || defined(SIMD_AVX) SIMD_TARGET -static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3) +inline void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3) { __m128i t0 = _mm_unpacklo_epi8(x0, x1); __m128i t1 = _mm_unpackhi_epi8(x0, x1); @@ -919,17 +1285,33 @@ static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3) } SIMD_TARGET -static __m128i unzigzag8(__m128i v) +inline __m128i unzigzag8(__m128i v) { __m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1))); __m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127)); return _mm_xor_si128(xl, xr); } + +SIMD_TARGET +inline __m128i unzigzag16(__m128i v) +{ + __m128i xl = _mm_sub_epi16(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi16(1))); + __m128i xr = _mm_srli_epi16(v, 1); + + return _mm_xor_si128(xl, xr); +} + +SIMD_TARGET +inline __m128i rotate32(__m128i v, int r) +{ + return _mm_or_si128(_mm_slli_epi32(v, r), _mm_srli_epi32(v, 32 - r)); +} #endif #ifdef SIMD_NEON -static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3) +SIMD_TARGET +inline void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3) { uint8x16x2_t t01 = vzipq_u8(x0, x1); uint8x16x2_t t23 = vzipq_u8(x2, x3); @@ -943,18 +1325,64 @@ static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_ x3 = vreinterpretq_u8_u16(x23.val[1]); } -static uint8x16_t unzigzag8(uint8x16_t v) +SIMD_TARGET +inline uint8x16_t unzigzag8(uint8x16_t v) { uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1))))); uint8x16_t xr = vshrq_n_u8(v, 1); return veorq_u8(xl, xr); } + +SIMD_TARGET +inline uint8x16_t unzigzag16(uint8x16_t v) +{ + uint16x8_t vv = vreinterpretq_u16_u8(v); + uint8x16_t xl = vreinterpretq_u8_s16(vnegq_s16(vreinterpretq_s16_u16(vandq_u16(vv, vdupq_n_u16(1))))); + uint8x16_t xr = vreinterpretq_u8_u16(vshrq_n_u16(vv, 1)); + + return veorq_u8(xl, xr); +} + +SIMD_TARGET +inline uint8x16_t rotate32(uint8x16_t v, int r) +{ + uint32x4_t v32 = vreinterpretq_u32_u8(v); + return vreinterpretq_u8_u32(vorrq_u32(vshlq_u32(v32, vdupq_n_s32(r)), vshlq_u32(v32, vdupq_n_s32(r - 32)))); +} + +template +SIMD_TARGET inline uint8x8_t rebase(uint8x8_t npi, uint8x16_t r0, uint8x16_t r1, uint8x16_t r2, uint8x16_t r3) +{ + switch (Channel) + { + case 0: + { + uint8x16_t rsum = vaddq_u8(vaddq_u8(r0, r1), vaddq_u8(r2, r3)); + uint8x8_t rsumx = vadd_u8(vget_low_u8(rsum), vget_high_u8(rsum)); + return vadd_u8(vadd_u8(npi, rsumx), vext_u8(rsumx, rsumx, 4)); + } + case 1: + { + uint16x8_t rsum = vaddq_u16(vaddq_u16(vreinterpretq_u16_u8(r0), vreinterpretq_u16_u8(r1)), vaddq_u16(vreinterpretq_u16_u8(r2), vreinterpretq_u16_u8(r3))); + uint16x4_t rsumx = 
vadd_u16(vget_low_u16(rsum), vget_high_u16(rsum));
+		return vreinterpret_u8_u16(vadd_u16(vadd_u16(vreinterpret_u16_u8(npi), rsumx), vext_u16(rsumx, rsumx, 2)));
+	}
+	case 2:
+	{
+		uint8x16_t rsum = veorq_u8(veorq_u8(r0, r1), veorq_u8(r2, r3));
+		uint8x8_t rsumx = veor_u8(vget_low_u8(rsum), vget_high_u8(rsum));
+		return veor_u8(veor_u8(npi, rsumx), vext_u8(rsumx, rsumx, 4));
+	}
+	default:
+		return npi;
+	}
+}
 #endif
 
 #ifdef SIMD_WASM
 SIMD_TARGET
-static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
+inline void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
 {
 	v128_t t0 = wasmx_unpacklo_v8x16(x0, x1);
 	v128_t t1 = wasmx_unpackhi_v8x16(x0, x1);
@@ -968,44 +1396,57 @@ static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
 }
 
 SIMD_TARGET
-static v128_t unzigzag8(v128_t v)
+inline v128_t unzigzag8(v128_t v)
 {
 	v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1)));
 	v128_t xr = wasm_u8x16_shr(v, 1);
 
 	return wasm_v128_xor(xl, xr);
 }
+
+SIMD_TARGET
+inline v128_t unzigzag16(v128_t v)
+{
+	v128_t xl = wasm_i16x8_neg(wasm_v128_and(v, wasm_i16x8_splat(1)));
+	v128_t xr = wasm_u16x8_shr(v, 1);
+
+	return wasm_v128_xor(xl, xr);
+}
+
+SIMD_TARGET
+inline v128_t rotate32(v128_t v, int r)
+{
+	return wasm_v128_or(wasm_i32x4_shl(v, r), wasm_i32x4_shr(v, 32 - r));
+}
 #endif
 
 #if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
 SIMD_TARGET
-static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
+static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size, int hshift)
 {
 	assert(buffer_size % kByteGroupSize == 0);
 	assert(kByteGroupSize == 16);
 
-	const unsigned char* header = data;
-
 	// round number of groups to 4 to get number of header bytes
 	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
-
 	if (size_t(data_end - data) < header_size)
 		return NULL;
 
+	const unsigned char* header = data;
 	data += header_size;
 
 	size_t i = 0;
 
-	// fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=24b
+	// fast-path: process 4 groups at a time, do a shared bounds check
 	for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4)
 	{
 		size_t header_offset = i / kByteGroupSize;
 		unsigned char header_byte = header[header_offset / 4];
 
-		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, (header_byte >> 0) & 3);
-		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, (header_byte >> 2) & 3);
-		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, (header_byte >> 4) & 3);
-		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, (header_byte >> 6) & 3);
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, hshift + ((header_byte >> 0) & 3));
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, hshift + ((header_byte >> 2) & 3));
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, hshift + ((header_byte >> 4) & 3));
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, hshift + ((header_byte >> 6) & 3));
	}
 
 	// slow-path: process remaining groups
@@ -1015,17 +1456,102 @@ static const unsigned char* decodeBytesSimd(const unsigned char* data, const uns
 			return NULL;
 
 		size_t header_offset = i / kByteGroupSize;
+		unsigned char header_byte = header[header_offset / 4];
 
-		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;
-
-		data = decodeBytesGroupSimd(data, buffer + i, bitslog2);
+		data = decodeBytesGroupSimd(data, buffer + i, hshift + ((header_byte >> ((header_offset % 4) * 2)) & 3));
 	}
 
 	return data;
 }
 
+template <int Channel>
+SIMD_TARGET static void
+decodeDeltas4Simd(const unsigned char* buffer, unsigned char* transposed, size_t vertex_count_aligned, size_t vertex_size, unsigned char last_vertex[4], int rot)
+{
+#if defined(SIMD_SSE) || defined(SIMD_AVX)
+#define TEMP __m128i
+#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
+#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
+#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
+#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
+#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
+#endif
+
+#ifdef SIMD_NEON
+#define TEMP uint8x8_t
+#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<const uint32_t*>(last_vertex), vdup_n_u32(0), 0))
+#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
+#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
+#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i))
+#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
+#endif
+
+#ifdef SIMD_WASM
+#define TEMP v128_t
+#define PREP() v128_t pi = wasm_v128_load(last_vertex)
+#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
+#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
+#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i))
+#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size
+#endif
+
+#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
+
+	PREP();
+
+	unsigned char* savep = transposed;
+
+	for (size_t j = 0; j < vertex_count_aligned; j += 16)
+	{
+		LOAD(0);
+		LOAD(1);
+		LOAD(2);
+		LOAD(3);
+
+		transpose8(r0, r1, r2, r3);
+
+		TEMP t0, t1, t2, t3;
+		TEMP npi = pi;
+
+		UNZR(0);
+		GRP4(0);
+		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+		UNZR(1);
+		GRP4(1);
+		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+		UNZR(2);
+		GRP4(2);
+		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+		UNZR(3);
+		GRP4(3);
+		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32))
+		// instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations
+		pi = rebase<Channel>(npi, r0, r1, r2, r3);
+#else
+		(void)npi;
+#endif
+
+#undef UNZR
+#undef TEMP
+#undef PREP
+#undef LOAD
+#undef GRP4
+#undef FIXD
+#undef SAVE
+	}
+}
+
 SIMD_TARGET
-static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version)
 {
 	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
 
@@ -1034,84 +1560,61 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con
 	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
 
+	size_t control_size = version == 0 ? 0 : vertex_size / 4;
+	if (size_t(data_end - data) < control_size)
+		return NULL;
+
+	const unsigned char* control = data;
+	data += control_size;
+
 	for (size_t k = 0; k < vertex_size; k += 4)
 	{
+		unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4];
+
 		for (size_t j = 0; j < 4; ++j)
 		{
-			data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned);
-			if (!data)
-				return NULL;
+			int ctrl = (ctrl_byte >> (j * 2)) & 3;
+
+			if (ctrl == 3)
+			{
+				// literal encoding; safe to over-copy due to tail
+				if (size_t(data_end - data) < vertex_count_aligned)
+					return NULL;
+
+				memcpy(buffer + j * vertex_count_aligned, data, vertex_count_aligned);
+				data += vertex_count;
+			}
+			else if (ctrl == 2)
+			{
+				// zero encoding
+				memset(buffer + j * vertex_count_aligned, 0, vertex_count_aligned);
+			}
+			else
+			{
+				// for v0, headers are mapped to 0..3; for v1, headers are mapped to 4..8
+				int hshift = version == 0 ? 0 : 4 + ctrl;
+
+				data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned, hshift);
+				if (!data)
+					return NULL;
+			}
 		}
 
-#if defined(SIMD_SSE) || defined(SIMD_AVX)
-#define TEMP __m128i
-#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex + k))
-#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
-#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
-#define FIXD(i) t##i = pi = _mm_add_epi8(pi, t##i)
-#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
-#endif
+		int channel = version == 0 ? 0 : channels[k / 4];
 
-#ifdef SIMD_NEON
-#define TEMP uint8x8_t
-#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<const uint32_t*>(last_vertex + k), vdup_n_u32(0), 0))
-#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
-#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
-#define FIXD(i) t##i = pi = vadd_u8(pi, t##i)
-#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
-#endif
-
-#ifdef SIMD_WASM
-#define TEMP v128_t
-#define PREP() v128_t pi = wasm_v128_load(last_vertex + k)
-#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
-#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
-#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i)
-#define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size
-#endif
-
-		PREP();
-
-		unsigned char* savep = transposed + k;
-
-		for (size_t j = 0; j < vertex_count_aligned; j += 16)
+		switch (channel & 3)
 		{
-			LOAD(0);
-			LOAD(1);
-			LOAD(2);
-			LOAD(3);
-
-			r0 = unzigzag8(r0);
-			r1 = unzigzag8(r1);
-			r2 = unzigzag8(r2);
-			r3 = unzigzag8(r3);
-
-			transpose8(r0, r1, r2, r3);
-
-			TEMP t0, t1, t2, t3;
-
-			GRP4(0);
-			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
-			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
-
-			GRP4(1);
-			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
-			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
-
-			GRP4(2);
-			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
-			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
-
-			GRP4(3);
-			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
-			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
-
-#undef TEMP
-#undef PREP
-#undef LOAD
-#undef GRP4
-#undef FIXD
-#undef SAVE
+		case 0:
+			decodeDeltas4Simd<0>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, 0);
+			break;
+		case 1:
+			decodeDeltas4Simd<1>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, 0);
+			break;
+		case 2:
+			decodeDeltas4Simd<2>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31);
+			break;
+		default:
+			return NULL; // invalid channel type
 		}
 	}
 
@@ -1140,12 +1643,13 @@ static unsigned int cpuid = getCpuFeatures();
 
 } // namespace meshopt
 
-size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
+size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level)
 {
 	using namespace meshopt;
 
 	assert(vertex_size > 0 && vertex_size <= 256);
 	assert(vertex_size % 4 == 0);
+	assert(level >= 0 && level <= 9); // only a subset of this range is used right now
 
 #if TRACE
 	memset(vertexstats, 0, sizeof(vertexstats));
@@ -1156,7 +1660,7 @@ size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, con
 	unsigned char* data = buffer;
 	unsigned char* data_end = buffer + buffer_size;
 
-	if (size_t(data_end - data) < 1 + vertex_size)
+	if (size_t(data_end - data) < 1)
 		return 0;
 
 	int version = gEncodeVertexVersion;
@@ -1172,34 +1676,52 @@ size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, con
 
 	size_t vertex_block_size = getVertexBlockSize(vertex_size);
 
+	unsigned char channels[64] = {};
+	if (version != 0 && level > 1 && vertex_count > 1)
+		for (size_t k = 0; k < vertex_size; k += 4)
+		{
+			int rot = level >= 3 ? estimateRotate(vertex_data, vertex_count, vertex_size, k, /* group_size= */ 16) : 0;
+			int channel = estimateChannel(vertex_data, vertex_count, vertex_size, k, vertex_block_size, /* block_skip= */ 3, /* max_channels= */ level >= 3 ? 3 : 2, rot);
+
+			assert(unsigned(channel) < 2 || ((channel & 3) == 2 && unsigned(channel >> 4) < 8));
+			channels[k / 4] = (unsigned char)channel;
+		}
+
 	size_t vertex_offset = 0;
 
 	while (vertex_offset < vertex_count)
 	{
 		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
 
-		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
+		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version, level);
 		if (!data)
 			return 0;
 
 		vertex_offset += block_size;
 	}
 
-	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
+	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
+	size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1;
+	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
 
-	if (size_t(data_end - data) < tail_size)
+	if (size_t(data_end - data) < tail_size_pad)
 		return 0;
 
-	// write first vertex to the end of the stream and pad it to 32 bytes; this is important to simplify bounds checks in decoder
-	if (vertex_size < kTailMaxSize)
+	if (tail_size < tail_size_pad)
 	{
-		memset(data, 0, kTailMaxSize - vertex_size);
-		data += kTailMaxSize - vertex_size;
+		memset(data, 0, tail_size_pad - tail_size);
+		data += tail_size_pad - tail_size;
 	}
 
 	memcpy(data, first_vertex, vertex_size);
 	data += vertex_size;
 
+	if (version != 0)
+	{
+		memcpy(data, channels, vertex_size / 4);
+		data += vertex_size / 4;
+	}
+
 	assert(data >= buffer + tail_size);
 	assert(data <= buffer + buffer_size);
 
@@ -1212,17 +1734,40 @@ size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, con
 		printf("%2d: %7d bytes [%4.1f%%] %.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);
 
-		size_t total_k = vsk.header + vsk.bitg[0] + vsk.bitg[1] + vsk.bitg[2] + vsk.bitg[3];
+		size_t total_k = vsk.header + vsk.bitg[1] + vsk.bitg[2] + vsk.bitg[4] + vsk.bitg[8];
+		double total_kr = total_k ? 1.0 / double(total_k) : 0;
 
-		printf(" |\thdr [%5.1f%%] bitg 1-3 [%4.1f%% %4.1f%% %4.1f%%]",
-		    double(vsk.header) / double(total_k) * 100, double(vsk.bitg[1]) / double(total_k) * 100,
-		    double(vsk.bitg[2]) / double(total_k) * 100, double(vsk.bitg[3]) / double(total_k) * 100);
+		if (version != 0)
+		{
+			int channel = channels[k / 4];
+
+			if ((channel & 3) == 2 && k % 4 == 0)
+				printf(" | ^%d", channel >> 4);
+			else
+				printf(" | %2s", channel == 0 ? "1" : (channel == 1 && k % 2 == 0 ? "2" : "."));
+		}
+
+		printf(" | hdr [%5.1f%%] bitg [1 %4.1f%% 2 %4.1f%% 4 %4.1f%% 8 %4.1f%%]",
+		    double(vsk.header) * total_kr * 100,
+		    double(vsk.bitg[1]) * total_kr * 100, double(vsk.bitg[2]) * total_kr * 100,
+		    double(vsk.bitg[4]) * total_kr * 100, double(vsk.bitg[8]) * total_kr * 100);
+
+		size_t total_ctrl = vsk.ctrl[0] + vsk.ctrl[1] + vsk.ctrl[2] + vsk.ctrl[3];
+
+		if (total_ctrl)
+		{
+			printf(" | ctrl %3.0f%% %3.0f%% %3.0f%% %3.0f%%",
+			    double(vsk.ctrl[0]) / double(total_ctrl) * 100, double(vsk.ctrl[1]) / double(total_ctrl) * 100,
+			    double(vsk.ctrl[2]) / double(total_ctrl) * 100, double(vsk.ctrl[3]) / double(total_ctrl) * 100);
+		}
+
+		if (level >= 3)
+			printf(" | bitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]",
+			    double(vsk.bitc[0]) / double(vertex_count) * 100, double(vsk.bitc[1]) / double(vertex_count) * 100,
+			    double(vsk.bitc[2]) / double(vertex_count) * 100, double(vsk.bitc[3]) / double(vertex_count) * 100,
+			    double(vsk.bitc[4]) / double(vertex_count) * 100, double(vsk.bitc[5]) / double(vertex_count) * 100,
+			    double(vsk.bitc[6]) / double(vertex_count) * 100, double(vsk.bitc[7]) / double(vertex_count) * 100);
 
-		printf(" |\tbitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]",
-		    double(vsk.bitc[0]) / double(vertex_count) * 100, double(vsk.bitc[1]) / double(vertex_count) * 100,
-		    double(vsk.bitc[2]) / double(vertex_count) * 100, double(vsk.bitc[3]) / double(vertex_count) * 100,
-		    double(vsk.bitc[4]) / double(vertex_count) * 100, double(vsk.bitc[5]) / double(vertex_count) * 100,
-		    double(vsk.bitc[6]) / double(vertex_count) * 100, double(vsk.bitc[7]) / double(vertex_count) * 100);
 		printf("\n");
 	}
 #endif
@@ -1230,6 +1775,11 @@ size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, con
 	return data - buffer;
 }
 
+size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+	return meshopt_encodeVertexBufferLevel(buffer, buffer_size, vertices, vertex_count, vertex_size, meshopt::kEncodeDefaultLevel);
+}
+
 size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
 {
 	using namespace meshopt;
@@ -1240,21 +1790,42 @@ size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
 	size_t vertex_block_size = getVertexBlockSize(vertex_size);
 	size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size;
 
+	size_t vertex_block_control_size = vertex_size / 4;
 	size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4;
 	size_t vertex_block_data_size = vertex_block_size;
 
-	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
+	size_t tail_size = vertex_size + (vertex_size / 4);
+	size_t tail_size_min = kTailMinSizeV0 > kTailMinSizeV1 ? kTailMinSizeV0 : kTailMinSizeV1;
+	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
+	assert(tail_size_pad >= kByteGroupDecodeLimit);
 
-	return 1 + vertex_block_count * vertex_size * (vertex_block_header_size + vertex_block_data_size) + tail_size;
+	return 1 + vertex_block_count * vertex_size * (vertex_block_control_size + vertex_block_header_size + vertex_block_data_size) + tail_size_pad;
 }
 
 void meshopt_encodeVertexVersion(int version)
 {
-	assert(unsigned(version) <= 0);
+	assert(unsigned(version) <= unsigned(meshopt::kDecodeVertexVersion));
 
 	meshopt::gEncodeVertexVersion = version;
 }
 
+int meshopt_decodeVertexVersion(const unsigned char* buffer, size_t buffer_size)
+{
+	if (buffer_size < 1)
+		return -1;
+
+	unsigned char header = buffer[0];
+
+	if ((header & 0xf0) != meshopt::kVertexHeader)
+		return -1;
+
+	int version = header & 0x0f;
+	if (version > meshopt::kDecodeVertexVersion)
+		return -1;
+
+	return version;
+}
+
 int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
 {
 	using namespace meshopt;
@@ -1262,7 +1833,7 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 	assert(vertex_size > 0 && vertex_size <= 256);
 	assert(vertex_size % 4 == 0);
 
-	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = NULL;
+	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256], const unsigned char*, int) = NULL;
 
 #if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
 	decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
@@ -1282,7 +1853,7 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 	const unsigned char* data = buffer;
 	const unsigned char* data_end = buffer + buffer_size;
 
-	if (size_t(data_end - data) < 1 + vertex_size)
+	if (size_t(data_end - data) < 1)
 		return -2;
 
 	unsigned char data_header = *data++;
@@ -1291,11 +1862,22 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 		return -1;
 
 	int version = data_header & 0x0f;
-	if (version > 0)
+	if (version > kDecodeVertexVersion)
 		return -1;
 
+	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
+	size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1;
+	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
+
+	if (size_t(data_end - data) < tail_size_pad)
+		return -2;
+
+	const unsigned char* tail = data_end - tail_size;
+
 	unsigned char last_vertex[256];
-	memcpy(last_vertex, data_end - vertex_size, vertex_size);
+	memcpy(last_vertex, tail, vertex_size);
+
+	const unsigned char* channels = version == 0 ? NULL : tail + vertex_size;
 
 	size_t vertex_block_size = getVertexBlockSize(vertex_size);
 
@@ -1305,16 +1887,14 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 	{
 		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
 
-		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
+		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version);
 		if (!data)
 			return -2;
 
 		vertex_offset += block_size;
 	}
 
-	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
-
-	if (size_t(data_end - data) != tail_size)
+	if (size_t(data_end - data) != tail_size_pad)
 		return -3;
 
 	return 0;
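---

Note (not part of the patch): the new public entry points above can be exercised end to end roughly as follows. This is a minimal sketch; the `Vertex` layout, the level value 2, and the assumption that the new bitstream is version 1 are illustrative, not taken from the diff.

	// round-trip sketch against the meshoptimizer 0.23 API shown above
	#include "meshoptimizer.h"

	#include <cassert>
	#include <vector>

	struct Vertex
	{
		float px, py, pz;      // 32-bit position channel
		unsigned short nx, ny; // hypothetical 16-bit channels (normal, UV)
		unsigned short tu, tv; // sizeof(Vertex) == 20, a multiple of 4 as required
	};

	std::vector<unsigned char> compress(const std::vector<Vertex>& vertices)
	{
		// opt in to the v1 bitstream (assumption: the new version is 1);
		// version 0 keeps the previous format for older decoders
		meshopt_encodeVertexVersion(1);

		std::vector<unsigned char> buffer(meshopt_encodeVertexBufferBound(vertices.size(), sizeof(Vertex)));

		// per the encoder above, level > 1 enables channel estimation and level >= 3 adds rotation estimation
		size_t size = meshopt_encodeVertexBufferLevel(buffer.data(), buffer.size(), vertices.data(), vertices.size(), sizeof(Vertex), /* level= */ 2);
		assert(size > 0);

		buffer.resize(size);
		return buffer;
	}

	std::vector<Vertex> decompress(const std::vector<unsigned char>& buffer, size_t vertex_count)
	{
		// lets callers reject streams that are malformed or newer than this decoder supports
		int version = meshopt_decodeVertexVersion(buffer.data(), buffer.size());
		assert(version >= 0);
		(void)version;

		std::vector<Vertex> result(vertex_count);
		int rc = meshopt_decodeVertexBuffer(result.data(), vertex_count, sizeof(Vertex), buffer.data(), buffer.size());
		assert(rc == 0);
		(void)rc;

		return result;
	}

As the diff shows, `meshopt_encodeVertexBuffer` keeps its old signature and now forwards to the level variant with `meshopt::kEncodeDefaultLevel`, so existing callers are unaffected; only code that wants the v1 stream needs to call `meshopt_encodeVertexVersion`.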