diff --git a/thirdparty/README.md b/thirdparty/README.md
index 90f516090d4..4898294be50 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -662,7 +662,7 @@ Patches:
 ## meshoptimizer

 - Upstream: https://github.com/zeux/meshoptimizer
-- Version: 0.22 (4affad044571506a5724c9a6f15424f43e86f731, 2024)
+- Version: 0.23 (3e9d1ff3135794f519f3237515277c8d9a3fd3f2, 2025)
 - License: MIT

 Files extracted from upstream repository:
@@ -670,10 +670,6 @@ Files extracted from upstream repository:
 - All files in `src/`
 - `LICENSE.md`

-Patches:
-
-- `0001-simplifier-distance-only-error.patch` (GH-98529)
-
 ## mingw-std-threads

diff --git a/thirdparty/meshoptimizer/LICENSE.md b/thirdparty/meshoptimizer/LICENSE.md
index ef9f5919f27..a5c3b1ccca7 100644
--- a/thirdparty/meshoptimizer/LICENSE.md
+++ b/thirdparty/meshoptimizer/LICENSE.md
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2016-2024 Arseny Kapoulkine
+Copyright (c) 2016-2025 Arseny Kapoulkine

 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal
diff --git a/thirdparty/meshoptimizer/clusterizer.cpp b/thirdparty/meshoptimizer/clusterizer.cpp
index 738add5f2fe..26d2fb11c51 100644
--- a/thirdparty/meshoptimizer/clusterizer.cpp
+++ b/thirdparty/meshoptimizer/clusterizer.cpp
@@ -13,12 +13,16 @@
 namespace meshopt
 {

-// This must be <= 255 since index 0xff is used internally to indice a vertex that doesn't belong to a meshlet
-const size_t kMeshletMaxVertices = 255;
+// This must be <= 256 since meshlet indices are stored as bytes
+const size_t kMeshletMaxVertices = 256;

 // A reasonable limit is around 2*max_vertices or less
 const size_t kMeshletMaxTriangles = 512;

+// We keep a limited number of seed triangles and add a few triangles per finished meshlet
+const size_t kMeshletMaxSeeds = 256;
+const size_t kMeshletAddSeeds = 4;
+
 struct TriangleAdjacency2
 {
 	unsigned int* counts;
@@ -70,72 +74,152 @@ static void buildTriangleAdjacency(TriangleAdjacency2& adjacency, const unsigned
 	for (size_t i = 0; i < vertex_count; ++i)
 	{
 		assert(adjacency.offsets[i] >= adjacency.counts[i]);
-
 		adjacency.offsets[i] -= adjacency.counts[i];
 	}
 }

-static void computeBoundingSphere(float result[4], const float points[][3], size_t count)
+static void buildTriangleAdjacencySparse(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
+{
+	size_t face_count = index_count / 3;
+
+	// sparse mode can build adjacency more quickly by ignoring unused vertices, using a bit to mark visited vertices
+	const unsigned int sparse_seen = 1u << 31;
+	assert(index_count < sparse_seen);
+
+	// allocate arrays
+	adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.data = allocator.allocate<unsigned int>(index_count);
+
+	// fill triangle counts
+	for (size_t i = 0; i < index_count; ++i)
+		assert(indices[i] < vertex_count);
+
+	for (size_t i = 0; i < index_count; ++i)
+		adjacency.counts[indices[i]] = 0;
+
+	for (size_t i = 0; i < index_count; ++i)
+		adjacency.counts[indices[i]]++;
+
+	// fill offset table; uses sparse_seen bit to tag visited vertices
+	unsigned int offset = 0;
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int v = indices[i];
+
+		if ((adjacency.counts[v] & sparse_seen) == 0)
+		{
+			adjacency.offsets[v] = offset;
+			offset += adjacency.counts[v];
+			adjacency.counts[v] |= sparse_seen;
+		}
+	}
+
+	assert(offset == index_count);
+
+	// fill triangle data
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+
+		adjacency.data[adjacency.offsets[a]++] = unsigned(i);
+		adjacency.data[adjacency.offsets[b]++] = unsigned(i);
+		adjacency.data[adjacency.offsets[c]++] = unsigned(i);
+	}
+
+	// fix offsets that have been disturbed by the previous pass
+	// also fix counts (that were marked with sparse_seen by the first pass)
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int v = indices[i];
+
+		if (adjacency.counts[v] & sparse_seen)
+		{
+			adjacency.counts[v] &= ~sparse_seen;
+
+			assert(adjacency.offsets[v] >= adjacency.counts[v]);
+			adjacency.offsets[v] -= adjacency.counts[v];
+		}
+	}
+}
+
+static void computeBoundingSphere(float result[4], const float* points, size_t count, size_t points_stride, const float* radii, size_t radii_stride)
 {
 	assert(count > 0);

+	size_t points_stride_float = points_stride / sizeof(float);
+	size_t radii_stride_float = radii_stride / sizeof(float);
+
 	// find extremum points along all 3 axes; for each axis we get a pair of points with min/max coordinates
 	size_t pmin[3] = {0, 0, 0};
 	size_t pmax[3] = {0, 0, 0};

 	for (size_t i = 0; i < count; ++i)
 	{
-		const float* p = points[i];
+		const float* p = points + i * points_stride_float;
+		float r = radii[i * radii_stride_float];

 		for (int axis = 0; axis < 3; ++axis)
 		{
-			pmin[axis] = (p[axis] < points[pmin[axis]][axis]) ? i : pmin[axis];
-			pmax[axis] = (p[axis] > points[pmax[axis]][axis]) ? i : pmax[axis];
+			float bmin = points[pmin[axis] * points_stride_float + axis] - radii[pmin[axis] * radii_stride_float];
+			float bmax = points[pmax[axis] * points_stride_float + axis] + radii[pmax[axis] * radii_stride_float];
+
+			pmin[axis] = (p[axis] - r < bmin) ? i : pmin[axis];
+			pmax[axis] = (p[axis] + r > bmax) ? i : pmax[axis];
 		}
 	}

 	// find the pair of points with largest distance
-	float paxisd2 = 0;
 	int paxis = 0;
+	float paxisdr = 0;

 	for (int axis = 0; axis < 3; ++axis)
 	{
-		const float* p1 = points[pmin[axis]];
-		const float* p2 = points[pmax[axis]];
+		const float* p1 = points + pmin[axis] * points_stride_float;
+		const float* p2 = points + pmax[axis] * points_stride_float;
+		float r1 = radii[pmin[axis] * radii_stride_float];
+		float r2 = radii[pmax[axis] * radii_stride_float];

 		float d2 = (p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]);
+		float dr = sqrtf(d2) + r1 + r2;

-		if (d2 > paxisd2)
+		if (dr > paxisdr)
 		{
-			paxisd2 = d2;
+			paxisdr = dr;
 			paxis = axis;
 		}
 	}

 	// use the longest segment as the initial sphere diameter
-	const float* p1 = points[pmin[paxis]];
-	const float* p2 = points[pmax[paxis]];
+	const float* p1 = points + pmin[paxis] * points_stride_float;
+	const float* p2 = points + pmax[paxis] * points_stride_float;
+	float r1 = radii[pmin[paxis] * radii_stride_float];
+	float r2 = radii[pmax[paxis] * radii_stride_float];

-	float center[3] = {(p1[0] + p2[0]) / 2, (p1[1] + p2[1]) / 2, (p1[2] + p2[2]) / 2};
-	float radius = sqrtf(paxisd2) / 2;
+	float paxisd = sqrtf((p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]));
+	float paxisk = paxisd > 0 ? (paxisd + r2 - r1) / (2 * paxisd) : 0.f;
+
+	float center[3] = {p1[0] + (p2[0] - p1[0]) * paxisk, p1[1] + (p2[1] - p1[1]) * paxisk, p1[2] + (p2[2] - p1[2]) * paxisk};
+	float radius = paxisdr / 2;

 	// iteratively adjust the sphere up until all points fit
 	for (size_t i = 0; i < count; ++i)
 	{
-		const float* p = points[i];
+		const float* p = points + i * points_stride_float;
+		float r = radii[i * radii_stride_float];
+
 		float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]);
+		float d = sqrtf(d2);

-		if (d2 > radius * radius)
+		if (d + r > radius)
 		{
-			float d = sqrtf(d2);
-			assert(d > 0);
+			float k = d > 0 ? (d + r - radius) / (2 * d) : 0.f;

-			float k = 0.5f + (radius / d) / 2;
-
-			center[0] = center[0] * k + p[0] * (1 - k);
-			center[1] = center[1] * k + p[1] * (1 - k);
-			center[2] = center[2] * k + p[2] * (1 - k);
-			radius = (radius + d) / 2;
+			center[0] += k * (p[0] - center[0]);
+			center[1] += k * (p[1] - center[1]);
+			center[2] += k * (p[2] - center[2]);
+			radius = (radius + d + r) / 2;
 		}
 	}
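The radius-aware computeBoundingSphere above is what backs the new meshopt_computeSphereBounds API added later in this diff. As a minimal usage sketch, not part of the diff itself, one can bound a set of cluster spheres packed as x, y, z, radius; the packed layout is an illustrative assumption:

	#include "meshoptimizer.h"

	// One sphere per cluster packed as x, y, z, radius, so a 16-byte stride
	// serves both the position stream and the radius stream.
	meshopt_Bounds boundSpheres(const float* spheres_xyzr, size_t count)
	{
		return meshopt_computeSphereBounds(spheres_xyzr, count, sizeof(float) * 4,
		    spheres_xyzr + 3, sizeof(float) * 4);
	}

Passing radii = NULL instead treats the input as plain points; only center and radius of the result are filled, the remaining fields are zero.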
@@ -151,12 +235,25 @@ struct Cone
 	float nx, ny, nz;
 };

-static float getMeshletScore(float distance2, float spread, float cone_weight, float expected_radius)
+static float getDistance(float dx, float dy, float dz, bool aa)
 {
+	if (!aa)
+		return sqrtf(dx * dx + dy * dy + dz * dz);
+
+	float rx = fabsf(dx), ry = fabsf(dy), rz = fabsf(dz);
+	float rxy = rx > ry ? rx : ry;
+	return rxy > rz ? rxy : rz;
+}
+
+static float getMeshletScore(float distance, float spread, float cone_weight, float expected_radius)
+{
+	if (cone_weight < 0)
+		return 1 + distance / expected_radius;
+
 	float cone = 1.f - spread * cone_weight;
 	float cone_clamped = cone < 1e-3f ? 1e-3f : cone;

-	return (1 + sqrtf(distance2) / expected_radius * (1 - cone_weight)) * cone_clamped;
+	return (1 + distance / expected_radius * (1 - cone_weight)) * cone_clamped;
 }

 static Cone getMeshletCone(const Cone& acc, unsigned int triangle_count)
@@ -230,22 +327,22 @@ static void finishMeshlet(meshopt_Meshlet& meshlet, unsigned char* meshlet_trian
 		meshlet_triangles[offset++] = 0;
 }

-static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, unsigned char* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles)
+static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, short* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles, bool split = false)
 {
-	unsigned char& av = used[a];
-	unsigned char& bv = used[b];
-	unsigned char& cv = used[c];
+	short& av = used[a];
+	short& bv = used[b];
+	short& cv = used[c];

 	bool result = false;

-	int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff);
+	int used_extra = (av < 0) + (bv < 0) + (cv < 0);

-	if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles)
+	if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles || split)
 	{
 		meshlets[meshlet_offset] = meshlet;

 		for (size_t j = 0; j < meshlet.vertex_count; ++j)
-			used[meshlet_vertices[meshlet.vertex_offset + j]] = 0xff;
+			used[meshlet_vertices[meshlet.vertex_offset + j]] = -1;

 		finishMeshlet(meshlet, meshlet_triangles);

@@ -257,33 +354,33 @@ static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int
 		result = true;
 	}

-	if (av == 0xff)
+	if (av < 0)
 	{
-		av = (unsigned char)meshlet.vertex_count;
+		av = short(meshlet.vertex_count);
 		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = a;
 	}

-	if (bv == 0xff)
+	if (bv < 0)
 	{
-		bv = (unsigned char)meshlet.vertex_count;
+		bv = short(meshlet.vertex_count);
 		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = b;
 	}

-	if (cv == 0xff)
+	if (cv < 0)
 	{
-		cv = (unsigned char)meshlet.vertex_count;
+		cv = short(meshlet.vertex_count);
 		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = c;
 	}

-	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = av;
-	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = bv;
-	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = cv;
+	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = (unsigned char)av;
+	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = (unsigned char)bv;
+	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = (unsigned char)cv;
 	meshlet.triangle_count++;

 	return result;
 }

-static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone* meshlet_cone, unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const unsigned char* used, float meshlet_expected_radius, float cone_weight)
+static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone& meshlet_cone, const unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const short* used, float meshlet_expected_radius, float cone_weight)
 {
 	unsigned int best_triangle = ~0u;
 	int best_priority = 5;
@@ -301,7 +398,7 @@ static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Co
 			unsigned int triangle = neighbors[j];
 			unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];

-			int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff);
+			int extra = (used[a] < 0) + (used[b] < 0) + (used[c] < 0);
 			assert(extra <= 2);

 			int priority = -1;
@@ -323,27 +420,13 @@ static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Co
 			if (priority > best_priority)
 				continue;

-			float score = 0;
+			const Cone& tri_cone = triangles[triangle];

-			// caller selects one of two scoring functions: geometrical (based on meshlet cone) or topological (based on remaining triangles)
-			if (meshlet_cone)
-			{
-				const Cone& tri_cone = triangles[triangle];
+			float dx = tri_cone.px - meshlet_cone.px, dy = tri_cone.py - meshlet_cone.py, dz = tri_cone.pz - meshlet_cone.pz;
+			float distance = getDistance(dx, dy, dz, cone_weight < 0);
+			float spread = tri_cone.nx * meshlet_cone.nx + tri_cone.ny * meshlet_cone.ny + tri_cone.nz * meshlet_cone.nz;

-				float distance2 =
-				    (tri_cone.px - meshlet_cone->px) * (tri_cone.px - meshlet_cone->px) +
-				    (tri_cone.py - meshlet_cone->py) * (tri_cone.py - meshlet_cone->py) +
-				    (tri_cone.pz - meshlet_cone->pz) * (tri_cone.pz - meshlet_cone->pz);
-
-				float spread = tri_cone.nx * meshlet_cone->nx + tri_cone.ny * meshlet_cone->ny + tri_cone.nz * meshlet_cone->nz;
-
-				score = getMeshletScore(distance2, spread, cone_weight, meshlet_expected_radius);
-			}
-			else
-			{
-				// each live_triangles entry is >= 1 since it includes the current triangle we're processing
-				score = float(live_triangles[a] + live_triangles[b] + live_triangles[c] - 3);
-			}
+			float score = getMeshletScore(distance, spread, cone_weight, meshlet_expected_radius);

 			// note that topology-based priority is always more important than the score
 			// this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost
@@ -359,6 +442,113 @@ static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Co
 	return best_triangle;
 }

+static size_t appendSeedTriangles(unsigned int* seeds, const meshopt_Meshlet& meshlet, const unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, float cornerx, float cornery, float cornerz)
+{
+	unsigned int best_seeds[kMeshletAddSeeds];
+	unsigned int best_live[kMeshletAddSeeds];
+	float best_score[kMeshletAddSeeds];
+
+	for (size_t i = 0; i < kMeshletAddSeeds; ++i)
+	{
+		best_seeds[i] = ~0u;
+		best_live[i] = ~0u;
+		best_score[i] = FLT_MAX;
+	}
+
+	for (size_t i = 0; i < meshlet.vertex_count; ++i)
+	{
+		unsigned int index = meshlet_vertices[meshlet.vertex_offset + i];
+
+		unsigned int best_neighbor = ~0u;
+		unsigned int best_neighbor_live = ~0u;
+
+		// find the neighbor with the smallest live metric
+		unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
+		size_t neighbors_size = adjacency.counts[index];
+
+		for (size_t j = 0; j < neighbors_size; ++j)
+		{
+			unsigned int triangle = neighbors[j];
+			unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
+
+			unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c];
+
+			if (live < best_neighbor_live)
+			{
+				best_neighbor = triangle;
+				best_neighbor_live = live;
+			}
+		}
+
+		// add the neighbor to the list of seeds; the list is unsorted and the replacement criteria is approximate
+		if (best_neighbor == ~0u)
+			continue;
+
+		float best_neighbor_score = getDistance(triangles[best_neighbor].px - cornerx, triangles[best_neighbor].py - cornery, triangles[best_neighbor].pz - cornerz, false);
+
+		for (size_t j = 0; j < kMeshletAddSeeds; ++j)
+		{
+			// non-strict comparison reduces the number of duplicate seeds (triangles adjacent to multiple vertices)
+			if (best_neighbor_live < best_live[j] || (best_neighbor_live == best_live[j] && best_neighbor_score <= best_score[j]))
+			{
+				best_seeds[j] = best_neighbor;
+				best_live[j] = best_neighbor_live;
+				best_score[j] = best_neighbor_score;
+				break;
+			}
+		}
+	}
+
+	// add surviving seeds to the meshlet
+	size_t seed_count = 0;
+
+	for (size_t i = 0; i < kMeshletAddSeeds; ++i)
+		if (best_seeds[i] != ~0u)
+			seeds[seed_count++] = best_seeds[i];
+
+	return seed_count;
+}
+
+static size_t pruneSeedTriangles(unsigned int* seeds, size_t seed_count, const unsigned char* emitted_flags)
+{
+	size_t result = 0;
+
+	for (size_t i = 0; i < seed_count; ++i)
+	{
+		unsigned int index = seeds[i];
+
+		seeds[result] = index;
+		result += emitted_flags[index] == 0;
+	}
+
+	return result;
+}
+
+static unsigned int selectSeedTriangle(const unsigned int* seeds, size_t seed_count, const unsigned int* indices, const Cone* triangles, const unsigned int* live_triangles, float cornerx, float cornery, float cornerz)
+{
+	unsigned int best_seed = ~0u;
+	unsigned int best_live = ~0u;
+	float best_score = FLT_MAX;
+
+	for (size_t i = 0; i < seed_count; ++i)
+	{
+		unsigned int index = seeds[i];
+		unsigned int a = indices[index * 3 + 0], b = indices[index * 3 + 1], c = indices[index * 3 + 2];
+
+		unsigned int live = live_triangles[a] + live_triangles[b] + live_triangles[c];
+		float score = getDistance(triangles[index].px - cornerx, triangles[index].py - cornery, triangles[index].pz - cornerz, false);
+
+		if (live < best_live || (live == best_live && score < best_score))
+		{
+			best_seed = index;
+			best_live = live;
+			best_score = score;
+		}
+	}
+
+	return best_seed;
+}
+
 struct KDNode
 {
 	union
@@ -467,7 +657,7 @@ static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const
 	return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size);
 }

-static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit)
+static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, bool aa, unsigned int& result, float& limit)
 {
 	const KDNode& node = nodes[root];

@@ -483,11 +673,8 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points,

 			const float* point = points + index * stride;

-			float distance2 =
-			    (point[0] - position[0]) * (point[0] - position[0]) +
-			    (point[1] - position[1]) * (point[1] - position[1]) +
-			    (point[2] - position[2]) * (point[2] - position[2]);
-			float distance = sqrtf(distance2);
+			float dx = point[0] - position[0], dy = point[1] - position[1], dz = point[2] - position[2];
+			float distance = getDistance(dx, dy, dz, aa);

 			if (distance < limit)
 			{
@@ -503,11 +690,11 @@ static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points,
 		unsigned int first = (delta <= 0) ? 0 : node.children;
 		unsigned int second = first ^ node.children;

-		kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, result, limit);
+		kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, aa, result, limit);

 		// only process the other node if it can have a match based on closest distance so far
 		if (fabsf(delta) <= limit)
-			kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, result, limit);
+			kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, aa, result, limit);
 	}
 }
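The aa flag threaded through kdtreeNearest selects the same Chebyshev (max-axis) metric that getMeshletScore uses when cone_weight is negative, so the spatial fallback stays consistent with the scoring metric. A hedged sketch of requesting axis-aligned clusters through the new entry point defined below; all buffer variables and the specific limits are illustrative assumptions:

	// Negative cone_weight switches both scoring and the k-d tree search to the
	// Chebyshev metric, biasing clusters toward axis-aligned boxes (raytracing).
	// Buffers must be sized via meshopt_buildMeshletsBound with min_triangles.
	size_t meshlet_count = meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles,
	    indices, index_count, vertex_positions, vertex_count, sizeof(float) * 3,
	    /* max_vertices= */ 64, /* min_triangles= */ 16, /* max_triangles= */ 64,
	    /* cone_weight= */ -0.25f, /* split_factor= */ 2.0f);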
@@ -535,7 +722,7 @@ size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_
 	return meshlet_limit_vertices > meshlet_limit_triangles ? meshlet_limit_vertices : meshlet_limit_triangles;
 }

-size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
+size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor)
 {
 	using namespace meshopt;

@@ -544,18 +731,25 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
 	assert(vertex_positions_stride % sizeof(float) == 0);

 	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
-	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
-	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
+	assert(min_triangles >= 1 && min_triangles <= max_triangles && max_triangles <= kMeshletMaxTriangles);
+	assert(min_triangles % 4 == 0 && max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned

-	assert(cone_weight >= 0 && cone_weight <= 1);
+	assert(cone_weight <= 1); // negative cone weight switches metric to optimize for axis-aligned meshlets
+	assert(split_factor >= 0);
+
+	if (index_count == 0)
+		return 0;

 	meshopt_Allocator allocator;

 	TriangleAdjacency2 adjacency = {};
-	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+	if (vertex_count > index_count && index_count < (1u << 31))
+		buildTriangleAdjacencySparse(adjacency, indices, index_count, vertex_count, allocator);
+	else
+		buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);

-	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
-	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
+	// live triangle counts; note, we alias adjacency.counts as we remove triangles after emitting them so the counts always match
+	unsigned int* live_triangles = adjacency.counts;

 	size_t face_count = index_count / 3;

@@ -578,9 +772,42 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
 	KDNode* nodes = allocator.allocate<KDNode>(face_count * 2);
 	kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8);

-	// index of the vertex in the meshlet, 0xff if the vertex isn't used
-	unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
-	memset(used, -1, vertex_count);
+	// find a specific corner of the mesh to use as a starting point for meshlet flow
+	float cornerx = FLT_MAX, cornery = FLT_MAX, cornerz = FLT_MAX;
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		const Cone& tri = triangles[i];
+
+		cornerx = cornerx > tri.px ? tri.px : cornerx;
+		cornery = cornery > tri.py ? tri.py : cornery;
+		cornerz = cornerz > tri.pz ? tri.pz : cornerz;
+	}
+
+	// index of the vertex in the meshlet, -1 if the vertex isn't used
+	short* used = allocator.allocate<short>(vertex_count);
+	memset(used, -1, vertex_count * sizeof(short));
+
+	// initial seed triangle is the one closest to the corner
+	unsigned int initial_seed = ~0u;
+	float initial_score = FLT_MAX;
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		const Cone& tri = triangles[i];
+
+		float score = getDistance(tri.px - cornerx, tri.py - cornery, tri.pz - cornerz, false);
+
+		if (initial_seed == ~0u || score < initial_score)
+		{
+			initial_seed = unsigned(i);
+			initial_score = score;
+		}
+	}
+
+	// seed triangles to continue meshlet flow
+	unsigned int seeds[kMeshletMaxSeeds] = {};
+	size_t seed_count = 0;

 	meshopt_Meshlet meshlet = {};
 	size_t meshlet_offset = 0;
@@ -591,46 +818,61 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
 	{
 		Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count);

-		unsigned int best_triangle = getNeighborTriangle(meshlet, &meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight);
-		int best_extra = best_triangle == ~0u ? -1 : (used[indices[best_triangle * 3 + 0]] == 0xff) + (used[indices[best_triangle * 3 + 1]] == 0xff) + (used[indices[best_triangle * 3 + 2]] == 0xff);
+		unsigned int best_triangle = ~0u;

-		// if the best triangle doesn't fit into current meshlet, the spatial scoring we've used is not very meaningful, so we re-select using topological scoring
-		if (best_triangle != ~0u && (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles))
-		{
-			best_triangle = getNeighborTriangle(meshlet, NULL, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, 0.f);
-		}
+		// for the first triangle, we don't have a meshlet cone yet, so we use the initial seed
+		// to continue the meshlet, we select an adjacent triangle based on connectivity and spatial scoring
+		if (meshlet_offset == 0 && meshlet.triangle_count == 0)
+			best_triangle = initial_seed;
+		else
+			best_triangle = getNeighborTriangle(meshlet, meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight);

-		// when we run out of neighboring triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity
+		bool split = false;
+
+		// when we run out of adjacent triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity
 		if (best_triangle == ~0u)
 		{
 			float position[3] = {meshlet_cone.px, meshlet_cone.py, meshlet_cone.pz};
 			unsigned int index = ~0u;
-			float limit = FLT_MAX;
+			float distance = FLT_MAX;

-			kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, limit);
+			kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, cone_weight < 0.f, index, distance);

 			best_triangle = index;
+			split = meshlet.triangle_count >= min_triangles && split_factor > 0 && distance > meshlet_expected_radius * split_factor;
 		}

 		if (best_triangle == ~0u)
 			break;

+		int best_extra = (used[indices[best_triangle * 3 + 0]] < 0) + (used[indices[best_triangle * 3 + 1]] < 0) + (used[indices[best_triangle * 3 + 2]] < 0);
+
+		// if the best triangle doesn't fit into current meshlet, we re-select using seeds to maintain global flow
+		if (split || (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles))
+		{
+			seed_count = pruneSeedTriangles(seeds, seed_count, emitted_flags);
+			seed_count = (seed_count + kMeshletAddSeeds <= kMeshletMaxSeeds) ? seed_count : kMeshletMaxSeeds - kMeshletAddSeeds;
+			seed_count += appendSeedTriangles(seeds + seed_count, meshlet, meshlet_vertices, indices, adjacency, triangles, live_triangles, cornerx, cornery, cornerz);
+
+			unsigned int best_seed = selectSeedTriangle(seeds, seed_count, indices, triangles, live_triangles, cornerx, cornery, cornerz);
+
+			// we may not find a valid seed triangle if the mesh is disconnected as seeds are based on adjacency
+			best_triangle = best_seed != ~0u ? best_seed : best_triangle;
+		}
+
 		unsigned int a = indices[best_triangle * 3 + 0], b = indices[best_triangle * 3 + 1], c = indices[best_triangle * 3 + 2];
 		assert(a < vertex_count && b < vertex_count && c < vertex_count);

 		// add meshlet to the output; when the current meshlet is full we reset the accumulated bounds
-		if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles))
+		if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles, split))
 		{
 			meshlet_offset++;
 			memset(&meshlet_cone_acc, 0, sizeof(meshlet_cone_acc));
 		}

-		live_triangles[a]--;
-		live_triangles[b]--;
-		live_triangles[c]--;
-
 		// remove emitted triangle from adjacency data
 		// this makes sure that we spend less time traversing these lists on subsequent iterations
+		// live triangle counts are updated as a byproduct of these adjustments
 		for (size_t k = 0; k < 3; ++k)
 		{
 			unsigned int index = indices[best_triangle * 3 + k];
@@ -659,6 +901,7 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
 		meshlet_cone_acc.ny += triangles[best_triangle].ny;
 		meshlet_cone_acc.nz += triangles[best_triangle].nz;

+		assert(!emitted_flags[best_triangle]);
 		emitted_flags[best_triangle] = 1;
 	}

@@ -669,10 +912,17 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
 		meshlets[meshlet_offset++] = meshlet;
 	}

-	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, min_triangles));
 	return meshlet_offset;
 }

+size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
+{
+	assert(cone_weight >= 0); // to use negative cone weight, use meshopt_buildMeshletsFlex
+
+	return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, max_triangles, max_triangles, cone_weight, 0.0f);
+}
+
 size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
 {
 	using namespace meshopt;
@@ -685,9 +935,9 @@ size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshle
 	meshopt_Allocator allocator;

-	// index of the vertex in the meshlet, 0xff if the vertex isn't used
-	unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
-	memset(used, -1, vertex_count);
+	// index of the vertex in the meshlet, -1 if the vertex isn't used
+	short* used = allocator.allocate<short>(vertex_count);
+	memset(used, -1, vertex_count * sizeof(short));

 	meshopt_Meshlet meshlet = {};
 	size_t meshlet_offset = 0;
@@ -768,15 +1018,17 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t
 	if (triangles == 0)
 		return bounds;

+	const float rzero = 0.f;
+
 	// compute cluster bounding sphere; we'll use the center to determine normal cone apex as well
 	float psphere[4] = {};
-	computeBoundingSphere(psphere, corners[0], triangles * 3);
+	computeBoundingSphere(psphere, corners[0][0], triangles * 3, sizeof(float) * 3, &rzero, 0);

 	float center[3] = {psphere[0], psphere[1], psphere[2]};

 	// treating triangle normals as points, find the bounding sphere - the sphere center determines the optimal cone axis
 	float nsphere[4] = {};
-	computeBoundingSphere(nsphere, normals, triangles);
+	computeBoundingSphere(nsphere, normals[0], triangles, sizeof(float) * 3, &rzero, 0);

 	float axis[3] = {nsphere[0], nsphere[1], nsphere[2]};
 	float axislength = sqrtf(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]);
@@ -886,6 +1138,33 @@ meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices
 	return meshopt_computeClusterBounds(indices, triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride);
 }
+meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride)
+{
+	using namespace meshopt;
+
+	assert(positions_stride >= 12 && positions_stride <= 256);
+	assert(positions_stride % sizeof(float) == 0);
+	assert((radii_stride >= 4 && radii_stride <= 256) || radii == NULL);
+	assert(radii_stride % sizeof(float) == 0);
+
+	meshopt_Bounds bounds = {};
+
+	if (count == 0)
+		return bounds;
+
+	const float rzero = 0.f;
+
+	float psphere[4] = {};
+	computeBoundingSphere(psphere, positions, count, positions_stride, radii ? radii : &rzero, radii ? radii_stride : 0);
+
+	bounds.center[0] = psphere[0];
+	bounds.center[1] = psphere[1];
+	bounds.center[2] = psphere[2];
+	bounds.radius = psphere[3];
+
+	return bounds;
+}
+
 void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count)
 {
 	using namespace meshopt;
@@ -953,23 +1232,23 @@ void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* mesh
 	// reorder meshlet vertices for access locality assuming index buffer is scanned sequentially
 	unsigned int order[kMeshletMaxVertices];

-	unsigned char remap[kMeshletMaxVertices];
-	memset(remap, -1, vertex_count);
+	short remap[kMeshletMaxVertices];
+	memset(remap, -1, vertex_count * sizeof(short));

 	size_t vertex_offset = 0;

 	for (size_t i = 0; i < triangle_count * 3; ++i)
 	{
-		unsigned char& r = remap[indices[i]];
-		if (r == 0xff)
+		short& r = remap[indices[i]];
+		if (r < 0)
 		{
-			r = (unsigned char)(vertex_offset);
+			r = short(vertex_offset);
 			order[vertex_offset] = vertices[indices[i]];
 			vertex_offset++;
 		}

-		indices[i] = r;
+		indices[i] = (unsigned char)r;
 	}

 	assert(vertex_offset <= vertex_count);
diff --git a/thirdparty/meshoptimizer/indexcodec.cpp b/thirdparty/meshoptimizer/indexcodec.cpp
index b3004600523..b4fdfe16d5c 100644
--- a/thirdparty/meshoptimizer/indexcodec.cpp
+++ b/thirdparty/meshoptimizer/indexcodec.cpp
@@ -14,6 +14,7 @@ const unsigned char kIndexHeader = 0xe0;
 const unsigned char kSequenceHeader = 0xd0;

 static int gEncodeIndexVersion = 1;
+const int kDecodeIndexVersion = 1;

 typedef unsigned int VertexFifo[16];
 typedef unsigned int EdgeFifo[16][2];
@@ -354,11 +355,28 @@ size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count)

 void meshopt_encodeIndexVersion(int version)
 {
-	assert(unsigned(version) <= 1);
+	assert(unsigned(version) <= unsigned(meshopt::kDecodeIndexVersion));

 	meshopt::gEncodeIndexVersion = version;
 }

+int meshopt_decodeIndexVersion(const unsigned char* buffer, size_t buffer_size)
+{
+	if (buffer_size < 1)
+		return -1;
+
+	unsigned char header = buffer[0];
+
+	if ((header & 0xf0) != meshopt::kIndexHeader && (header & 0xf0) != meshopt::kSequenceHeader)
+		return -1;
+
+	int version = header & 0x0f;
+	if (version > meshopt::kDecodeIndexVersion)
+		return -1;
+
+	return version;
+}
+
 int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size)
 {
 	using namespace meshopt;
@@ -374,7 +392,7 @@ int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t inde
 		return -1;

 	int version = buffer[0] & 0x0f;
-	if (version > 1)
+	if (version > kDecodeIndexVersion)
 		return -1;

 	EdgeFifo edgefifo;
@@ -627,7 +645,7 @@ int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t in
 		return -1;

 	int version = buffer[0] & 0x0f;
-	if (version > 1)
+	if (version > kDecodeIndexVersion)
 		return -1;

 	const unsigned char* data = buffer + 1;
diff --git a/thirdparty/meshoptimizer/meshoptimizer.h b/thirdparty/meshoptimizer/meshoptimizer.h
index 77be5371fc7..295324c784d 100644
--- a/thirdparty/meshoptimizer/meshoptimizer.h
+++ b/thirdparty/meshoptimizer/meshoptimizer.h
@@ -1,7 +1,7 @@
 /**
- * meshoptimizer - version 0.22
+ * meshoptimizer - version 0.23
  *
- * Copyright (C) 2016-2024, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Copyright (C) 2016-2025, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
  * Report bugs and download new versions at https://github.com/zeux/meshoptimizer
  *
  * This library is distributed under the MIT License. See notice at the end of this file.
@@ -12,7 +12,7 @@
 #include <stddef.h>

 /* Version macro; major * 1000 + minor * 10 + patch */
-#define MESHOPTIMIZER_VERSION 220 /* 0.22 */
+#define MESHOPTIMIZER_VERSION 230 /* 0.23 */

 /* If no API is defined, assume default */
 #ifndef MESHOPTIMIZER_API
@@ -243,6 +243,13 @@ MESHOPTIMIZER_API void meshopt_encodeIndexVersion(int version);
 */
 MESHOPTIMIZER_API int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size);

+/**
+ * Get encoded index format version
+ * Returns format version of the encoded index buffer/sequence, or -1 if the buffer header is invalid
+ * Note that a non-negative value doesn't guarantee that the buffer will be decoded correctly if the input is malformed.
+ */
+MESHOPTIMIZER_API int meshopt_decodeIndexVersion(const unsigned char* buffer, size_t buffer_size);
+
 /**
 * Index sequence encoder
 * Encodes index sequence into an array of bytes that is generally smaller and compresses better compared to original.
@@ -277,9 +284,19 @@ MESHOPTIMIZER_API int meshopt_decodeIndexSequence(void* destination, size_t inde
 MESHOPTIMIZER_API size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size);
 MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size);

+/**
+ * Experimental: Vertex buffer encoder
+ * Encodes vertex data just like meshopt_encodeVertexBuffer, but allows to override compression level.
+ * For compression level to take effect, the vertex encoding version must be set to 1 via meshopt_encodeVertexVersion.
+ * The default compression level implied by meshopt_encodeVertexBuffer is 2.
+ *
+ * level should be in the range [0, 3] with 0 being the fastest and 3 being the slowest and producing the best compression ratio.
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level);
+
 /**
 * Set vertex encoder format version
- * version must specify the data format version to encode; valid values are 0 (decodable by all library versions)
+ * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.23+)
 */
 MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version);

@@ -293,6 +310,13 @@ MESHOPTIMIZER_API void meshopt_encodeVertexVersion(int version);
 */
 MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size);

+/**
+ * Get encoded vertex format version
+ * Returns format version of the encoded vertex buffer, or -1 if the buffer header is invalid
+ * Note that a non-negative value doesn't guarantee that the buffer will be decoded correctly if the input is malformed.
+ */
+MESHOPTIMIZER_API int meshopt_decodeVertexVersion(const unsigned char* buffer, size_t buffer_size);
+
 /**
 * Vertex buffer filters
 * These functions can be used to filter output of meshopt_decodeVertexBuffer in-place.
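The version query and level override compose naturally; a minimal sketch, not part of the diff, where the Vertex struct and the choice of level 3 are assumptions (the same decodeIndexVersion check applies to index buffers):

	#include "meshoptimizer.h"

	#include <vector>

	struct Vertex { float px, py, pz; };

	std::vector<unsigned char> encode(const std::vector<Vertex>& vertices)
	{
		meshopt_encodeVertexVersion(1); // v1 is required for the level override; decodable by 0.23+

		std::vector<unsigned char> buffer(meshopt_encodeVertexBufferBound(vertices.size(), sizeof(Vertex)));
		buffer.resize(meshopt_encodeVertexBufferLevel(buffer.data(), buffer.size(), vertices.data(), vertices.size(), sizeof(Vertex), /* level= */ 3));
		return buffer;
	}

	bool decode(std::vector<Vertex>& vertices, const std::vector<unsigned char>& buffer)
	{
		// reject buffers whose format version is unknown to this decoder;
		// vertices must be presized by the caller
		if (meshopt_decodeVertexVersion(buffer.data(), buffer.size()) < 0)
			return false;

		return meshopt_decodeVertexBuffer(vertices.data(), vertices.size(), sizeof(Vertex), buffer.data(), buffer.size()) == 0;
	}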
@@ -334,7 +358,7 @@ enum meshopt_EncodeExpMode
 	meshopt_EncodeExpSharedVector,
 	/* When encoding exponents, use shared value for each component of all vectors (best compression) */
 	meshopt_EncodeExpSharedComponent,
-	/* Experimental: When encoding exponents, use separate values for each component, but clamp to 0 (good quality if very small values are not important) */
+	/* When encoding exponents, use separate values for each component, but clamp to 0 (good quality if very small values are not important) */
 	meshopt_EncodeExpClamped,
 };

@@ -375,7 +399,7 @@ enum
 MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error);

 /**
- * Experimental: Mesh simplifier with attribute metric
+ * Mesh simplifier with attribute metric
 * The algorithm enhances meshopt_simplify by incorporating attribute values into the error metric used to prioritize simplification order; see meshopt_simplify documentation for details.
 * Note that the number of attributes affects memory requirements and running time; this algorithm requires ~1.5x more memory and time compared to meshopt_simplify when using 4 scalar attributes.
 *
@@ -384,7 +408,7 @@ MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsig
 * attribute_count must be <= 32
 * vertex_lock can be NULL; when it's not NULL, it should have a value for each vertex; 1 denotes vertices that can't be moved
 */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);
+MESHOPTIMIZER_API size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, const unsigned char* vertex_lock, size_t target_index_count, float target_error, unsigned int options, float* result_error);

 /**
 * Experimental: Mesh simplifier (sloppy)
@@ -402,7 +426,7 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* d
 MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error);

 /**
- * Experimental: Point cloud simplifier
+ * Point cloud simplifier
 * Reduces the number of points in the cloud to reach the given target
 * Returns the number of points after simplification, with destination containing new index data
 * The resulting index buffer references vertices from the original vertex buffer.
@@ -410,10 +434,10 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destinati
 *
 * destination must contain enough space for the target index buffer (target_vertex_count elements)
 * vertex_positions should have float3 position in the first 12 bytes of each vertex
- * vertex_colors should can be NULL; when it's not NULL, it should have float3 color in the first 12 bytes of each vertex
+ * vertex_colors can be NULL; when it's not NULL, it should have float3 color in the first 12 bytes of each vertex
 * color_weight determines relative priority of color wrt position; 1.0 is a safe default
 */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count);
+MESHOPTIMIZER_API size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_colors, size_t vertex_colors_stride, float color_weight, size_t target_vertex_count);

 /**
 * Returns the error scaling factor used by the simplifier to convert between absolute and relative extents
@@ -520,7 +544,7 @@ struct meshopt_Meshlet
 * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices
 * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3
 * vertex_positions should have float3 position in the first 12 bytes of each vertex
- * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 255 - not 256!, max_triangles <= 512; max_triangles must be divisible by 4)
+ * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; max_triangles must be divisible by 4)
 * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
 */
 MESHOPTIMIZER_API size_t meshopt_buildMeshlets(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight);
@@ -528,14 +552,30 @@ MESHOPTIMIZER_API size_t meshopt_buildMeshletsScan(struct meshopt_Meshlet* meshl
 MESHOPTIMIZER_API size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles);

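The sizing rules above translate directly into allocation code. A sketch under stated assumptions (tightly packed float3 positions; the limits 64/124 are illustrative):

	#include "meshoptimizer.h"

	#include <vector>

	size_t buildMeshletsSized(std::vector<meshopt_Meshlet>& meshlets,
	    std::vector<unsigned int>& meshlet_vertices, std::vector<unsigned char>& meshlet_triangles,
	    const std::vector<unsigned int>& indices, const std::vector<float>& positions)
	{
		const size_t max_vertices = 64;
		const size_t max_triangles = 124; // <= 512 and divisible by 4

		size_t max_meshlets = meshopt_buildMeshletsBound(indices.size(), max_vertices, max_triangles);

		meshlets.resize(max_meshlets);
		meshlet_vertices.resize(max_meshlets * max_vertices);
		meshlet_triangles.resize(max_meshlets * max_triangles * 3);

		return meshopt_buildMeshlets(meshlets.data(), meshlet_vertices.data(), meshlet_triangles.data(),
		    indices.data(), indices.size(), positions.data(), positions.size() / 3, sizeof(float) * 3,
		    max_vertices, max_triangles, /* cone_weight= */ 0.f);
	}

For meshopt_buildMeshletsFlex (documented next), the meshlet count bound must be computed with min_triangles instead of max_triangles.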
 /**
- * Experimental: Meshlet optimizer
+ * Experimental: Meshlet builder with flexible cluster sizes
+ * Splits the mesh into a set of meshlets, similarly to meshopt_buildMeshlets, but allows to specify minimum and maximum number of triangles per meshlet.
+ * Clusters between min and max triangle counts are split when the cluster size would have exceeded the expected cluster size by more than split_factor.
+ * Additionally, allows to switch to axis aligned clusters by setting cone_weight to a negative value.
+ *
+ * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound using min_triangles (not max!)
+ * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices
+ * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
+ * max_vertices, min_triangles and max_triangles must not exceed implementation limits (max_vertices <= 256, max_triangles <= 512; min_triangles <= max_triangles; both min_triangles and max_triangles must be divisible by 4)
+ * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency; additionally, cone_weight can be set to a negative value to prioritize axis aligned clusters (for raytracing) instead
+ * split_factor should be set to a non-negative value; when greater than 0, clusters that have large bounds may be split unless they are under the min_triangles threshold
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsFlex(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor);
+
+/**
+ * Meshlet optimizer
 * Reorders meshlet vertices and triangles to maximize locality to improve rasterizer throughput
 *
 * meshlet_triangles and meshlet_vertices must refer to meshlet triangle and vertex index data; when buildMeshlets* is used, these
 * need to be computed from meshlet's vertex_offset and triangle_offset
- * triangle_count and vertex_count must not exceed implementation limits (vertex_count <= 255 - not 256!, triangle_count <= 512)
+ * triangle_count and vertex_count must not exceed implementation limits (vertex_count <= 256, triangle_count <= 512)
 */
-MESHOPTIMIZER_EXPERIMENTAL void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count);
+MESHOPTIMIZER_API void meshopt_optimizeMeshlet(unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t triangle_count, size_t vertex_count);

 struct meshopt_Bounds
 {
@@ -579,6 +619,27 @@ struct meshopt_Bounds
 MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);

+/**
+ * Experimental: Sphere bounds generator
+ * Creates bounding sphere around a set of points or a set of spheres; returns the center and radius of the sphere, with other fields of the result set to 0.
+ *
+ * positions should have float3 position in the first 12 bytes of each element
+ * radii can be NULL; when it's not NULL, it should have a non-negative float radius in the first 4 bytes of each element
+ */
+MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeSphereBounds(const float* positions, size_t count, size_t positions_stride, const float* radii, size_t radii_stride);
+
+/**
+ * Experimental: Cluster partitioner
+ * Partitions clusters into groups of similar size, prioritizing grouping clusters that share vertices.
+ *
+ * destination must contain enough space for the resulting partition data (cluster_count elements)
+ * destination[i] will contain the partition id for cluster i, with the total number of partitions returned by the function
+ * cluster_indices should have the vertex indices referenced by each cluster, stored sequentially
+ * cluster_index_counts should have the number of indices in each cluster; sum of all cluster_index_counts must be equal to total_index_count
+ * target_partition_size is a target size for each partition, in clusters; the resulting partitions may be smaller or larger
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, size_t vertex_count, size_t target_partition_size);
+
 /**
 * Spatial sorter
 * Generates a remap table that can be used to reorder points for spatial locality.
@@ -598,34 +659,6 @@ MESHOPTIMIZER_API void meshopt_spatialSortRemap(unsigned int* destination, const
 */
 MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);

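The cluster_indices/cluster_index_counts layout expected by meshopt_partitionClusters can be produced directly from meshlet output. A sketch, assuming meshlets were built by meshopt_buildMeshlets as above; the target size of 8 clusters is an arbitrary choice:

	#include "meshoptimizer.h"

	#include <vector>

	size_t partitionMeshlets(std::vector<unsigned int>& partition,
	    const std::vector<meshopt_Meshlet>& meshlets, const std::vector<unsigned int>& meshlet_vertices,
	    const std::vector<unsigned char>& meshlet_triangles, size_t vertex_count)
	{
		std::vector<unsigned int> cluster_indices;
		std::vector<unsigned int> cluster_index_counts;

		for (size_t i = 0; i < meshlets.size(); ++i)
		{
			const meshopt_Meshlet& m = meshlets[i];

			// expand meshlet-local byte indices back into mesh-wide vertex indices
			for (size_t j = 0; j < m.triangle_count * 3u; ++j)
				cluster_indices.push_back(meshlet_vertices[m.vertex_offset + meshlet_triangles[m.triangle_offset + j]]);

			cluster_index_counts.push_back(m.triangle_count * 3);
		}

		partition.resize(meshlets.size());

		return meshopt_partitionClusters(partition.data(), cluster_indices.data(), cluster_indices.size(),
		    cluster_index_counts.data(), meshlets.size(), vertex_count, /* target_partition_size= */ 8);
	}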
-/**
- * Set allocation callbacks
- * These callbacks will be used instead of the default operator new/operator delete for all temporary allocations in the library.
- * Note that all algorithms only allocate memory for temporary use.
- * allocate/deallocate are always called in a stack-like order - last pointer to be allocated is deallocated first.
- */
-MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*));
-
-#ifdef __cplusplus
-} /* extern "C" */
-#endif
-
-/* Quantization into commonly supported data formats */
-#ifdef __cplusplus
-/**
- * Quantize a float in [0..1] range into an N-bit fixed point unorm value
- * Assumes reconstruction function (q / (2^N-1)), which is the case for fixed-function normalized fixed point conversion
- * Maximum reconstruction error: 1/2^(N+1)
- */
-inline int meshopt_quantizeUnorm(float v, int N);
-
-/**
- * Quantize a float in [-1..1] range into an N-bit fixed point snorm value
- * Assumes reconstruction function (q / (2^(N-1)-1)), which is the case for fixed-function normalized fixed point conversion (except early OpenGL versions)
- * Maximum reconstruction error: 1/2^N
- */
-inline int meshopt_quantizeSnorm(float v, int N);
-
 /**
 * Quantize a float into half-precision (as defined by IEEE-754 fp16) floating point value
 * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
@@ -646,6 +679,34 @@ MESHOPTIMIZER_API float meshopt_quantizeFloat(float v, int N);
 * Preserves Inf/NaN, flushes denormals to zero
 */
 MESHOPTIMIZER_API float meshopt_dequantizeHalf(unsigned short h);
+
+/**
+ * Set allocation callbacks
+ * These callbacks will be used instead of the default operator new/operator delete for all temporary allocations in the library.
+ * Note that all algorithms only allocate memory for temporary use.
+ * allocate/deallocate are always called in a stack-like order - last pointer to be allocated is deallocated first.
+ */
+MESHOPTIMIZER_API void meshopt_setAllocator(void* (MESHOPTIMIZER_ALLOC_CALLCONV* allocate)(size_t), void (MESHOPTIMIZER_ALLOC_CALLCONV* deallocate)(void*));
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+/* Quantization into fixed point normalized formats; these are only available as inline C++ functions */
+#ifdef __cplusplus
+/**
+ * Quantize a float in [0..1] range into an N-bit fixed point unorm value
+ * Assumes reconstruction function (q / (2^N-1)), which is the case for fixed-function normalized fixed point conversion
+ * Maximum reconstruction error: 1/2^(N+1)
+ */
+inline int meshopt_quantizeUnorm(float v, int N);
+
+/**
+ * Quantize a float in [-1..1] range into an N-bit fixed point snorm value
+ * Assumes reconstruction function (q / (2^(N-1)-1)), which is the case for fixed-function normalized fixed point conversion (except early OpenGL versions)
+ * Maximum reconstruction error: 1/2^N
+ */
+inline int meshopt_quantizeSnorm(float v, int N);
 #endif

 /**
@@ -714,8 +775,12 @@ inline size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* mes
 template <typename T>
 inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
 template <typename T>
+inline size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor);
+template <typename T>
 inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 template <typename T>
+inline size_t meshopt_partitionClusters(unsigned int* destination, const T* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, size_t vertex_count, size_t target_partition_size);
+template <typename T>
 inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 #endif

@@ -1094,6 +1159,14 @@ inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int*
 	return meshopt_buildMeshletsScan(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_count, max_vertices, max_triangles);
 }

+template <typename T>
+inline size_t meshopt_buildMeshletsFlex(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t min_triangles, size_t max_triangles, float cone_weight, float split_factor)
+{
+	meshopt_IndexAdapter<T> in(NULL, indices, index_count);
+
+	return meshopt_buildMeshletsFlex(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, min_triangles, max_triangles, cone_weight, split_factor);
+}
+
 template <typename T>
 inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
@@ -1102,6 +1175,14 @@ inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t inde
 	return meshopt_computeClusterBounds(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
 }

+template <typename T>
+inline size_t meshopt_partitionClusters(unsigned int* destination, const T* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, size_t vertex_count, size_t target_partition_size)
+{
+	meshopt_IndexAdapter<T> in(NULL, cluster_indices, total_index_count);
+
+	return meshopt_partitionClusters(destination, in.data, total_index_count, cluster_index_counts, cluster_count, vertex_count, target_partition_size);
+}
+
 template <typename T>
 inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
@@ -1113,7 +1194,7 @@ inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_
 #endif

 /**
- * Copyright (c) 2016-2024 Arseny Kapoulkine
+ * Copyright (c) 2016-2025 Arseny Kapoulkine
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
diff --git a/thirdparty/meshoptimizer/partition.cpp b/thirdparty/meshoptimizer/partition.cpp
new file mode 100644
index 00000000000..9c229980552
--- /dev/null
+++ b/thirdparty/meshoptimizer/partition.cpp
@@ -0,0 +1,429 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+namespace meshopt
+{
+
+struct ClusterAdjacency
+{
+	unsigned int* offsets;
+	unsigned int* clusters;
+	unsigned int* shared;
+};
+
int* ref_offsets = allocator.allocate(vertex_count + 1); + + // compute number of clusters referenced by each vertex + memset(ref_offsets, 0, vertex_count * sizeof(unsigned int)); + + for (size_t i = 0; i < cluster_count; ++i) + { + for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) + { + unsigned int v = cluster_indices[j]; + assert(v < vertex_count); + + ref_offsets[v] += 1 - used[v]; + used[v] = 1; + } + + for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) + used[cluster_indices[j]] = 0; + } + + // compute (worst-case) number of adjacent clusters for each cluster + size_t total_adjacency = 0; + + for (size_t i = 0; i < cluster_count; ++i) + { + size_t count = 0; + for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) + { + unsigned int v = cluster_indices[j]; + assert(v < vertex_count); + + // worst case is every vertex has a disjoint cluster list + count += used[v] ? 0 : ref_offsets[v] - 1; + used[v] = 1; + } + + // ... but only every other cluster can be adjacent in the end + total_adjacency += count < cluster_count - 1 ? count : cluster_count - 1; + + for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) + used[cluster_indices[j]] = 0; + } + + // we can now allocate adjacency buffers + adjacency.offsets = allocator.allocate(cluster_count + 1); + adjacency.clusters = allocator.allocate(total_adjacency); + adjacency.shared = allocator.allocate(total_adjacency); + + // convert ref counts to offsets + size_t total_refs = 0; + + for (size_t i = 0; i < vertex_count; ++i) + { + size_t count = ref_offsets[i]; + ref_offsets[i] = unsigned(total_refs); + total_refs += count; + } + + unsigned int* ref_data = allocator.allocate(total_refs); + + // fill cluster refs for each vertex + for (size_t i = 0; i < cluster_count; ++i) + { + for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) + { + unsigned int v = cluster_indices[j]; + assert(v < vertex_count); + + if (used[v]) + continue; + + ref_data[ref_offsets[v]++] = unsigned(i); + used[v] = 1; + } + + for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) + used[cluster_indices[j]] = 0; + } + + // after the previous pass, ref_offsets contain the end of the data for each vertex; shift it forward to get the start + memmove(ref_offsets + 1, ref_offsets, vertex_count * sizeof(unsigned int)); + ref_offsets[0] = 0; + + // fill cluster adjacency for each cluster... + adjacency.offsets[0] = 0; + + for (size_t i = 0; i < cluster_count; ++i) + { + unsigned int* adj = adjacency.clusters + adjacency.offsets[i]; + unsigned int* shd = adjacency.shared + adjacency.offsets[i]; + size_t count = 0; + + for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) + { + unsigned int v = cluster_indices[j]; + assert(v < vertex_count); + + if (used[v]) + continue; + + // merge the entire cluster list of each vertex into current list + for (size_t k = ref_offsets[v]; k < ref_offsets[v + 1]; ++k) + { + unsigned int c = ref_data[k]; + assert(c < cluster_count); + + if (c == unsigned(i)) + continue; + + // if the cluster is already in the list, increment the shared count + bool found = false; + for (size_t l = 0; l < count; ++l) + if (adj[l] == c) + { + found = true; + shd[l]++; + break; + } + + // .. 
or append a new cluster + if (!found) + { + adj[count] = c; + shd[count] = 1; + count++; + } + } + + used[v] = 1; + } + + for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) + used[cluster_indices[j]] = 0; + + // mark the end of the adjacency list; the next cluster will start there as well + adjacency.offsets[i + 1] = adjacency.offsets[i] + unsigned(count); + } + + assert(adjacency.offsets[cluster_count] <= total_adjacency); + + // ref_offsets can't be deallocated as it was allocated before adjacency + allocator.deallocate(ref_data); +} + +struct ClusterGroup +{ + int group; + int next; + unsigned int size; // 0 unless root + unsigned int vertices; +}; + +struct GroupOrder +{ + unsigned int id; + int order; +}; + +static void heapPush(GroupOrder* heap, size_t size, GroupOrder item) +{ + // insert a new element at the end (breaks heap invariant) + heap[size++] = item; + + // bubble up the new element to its correct position + size_t i = size - 1; + while (i > 0 && heap[i].order < heap[(i - 1) / 2].order) + { + size_t p = (i - 1) / 2; + + GroupOrder temp = heap[i]; + heap[i] = heap[p]; + heap[p] = temp; + i = p; + } +} + +static GroupOrder heapPop(GroupOrder* heap, size_t size) +{ + assert(size > 0); + GroupOrder top = heap[0]; + + // move the last element to the top (breaks heap invariant) + heap[0] = heap[--size]; + + // bubble down the new top element to its correct position + size_t i = 0; + while (i * 2 + 1 < size) + { + // find the smallest child + size_t j = i * 2 + 1; + j += (j + 1 < size && heap[j + 1].order < heap[j].order); + + // if the parent is already smaller than both children, we're done + if (heap[j].order >= heap[i].order) + break; + + // otherwise, swap the parent and child and continue + GroupOrder temp = heap[i]; + heap[i] = heap[j]; + heap[j] = temp; + i = j; + } + + return top; +} + +static unsigned int countTotal(const ClusterGroup* groups, int id, const unsigned int* cluster_indices, const unsigned int* cluster_offsets, unsigned char* used) +{ + unsigned int total = 0; + + for (int i = id; i >= 0; i = groups[i].next) + { + for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) + { + unsigned int v = cluster_indices[j]; + total += 1 - used[v]; + used[v] = 1; + } + } + + for (int i = id; i >= 0; i = groups[i].next) + { + for (size_t j = cluster_offsets[i]; j < cluster_offsets[i + 1]; ++j) + used[cluster_indices[j]] = 0; + } + + return total; +} + +static unsigned int countShared(const ClusterGroup* groups, int group1, int group2, const ClusterAdjacency& adjacency) +{ + unsigned int total = 0; + + for (int i1 = group1; i1 >= 0; i1 = groups[i1].next) + for (int i2 = group2; i2 >= 0; i2 = groups[i2].next) + { + for (unsigned int adj = adjacency.offsets[i1]; adj < adjacency.offsets[i1 + 1]; ++adj) + if (adjacency.clusters[adj] == unsigned(i2)) + { + total += adjacency.shared[adj]; + break; + } + } + + return total; +} + +static int pickGroupToMerge(const ClusterGroup* groups, int id, const ClusterAdjacency& adjacency, size_t max_partition_size) +{ + assert(groups[id].size > 0); + + float group_rsqrt = 1.f / sqrtf(float(int(groups[id].vertices))); + + int best_group = -1; + float best_score = 0; + + for (int ci = id; ci >= 0; ci = groups[ci].next) + { + for (unsigned int adj = adjacency.offsets[ci]; adj != adjacency.offsets[ci + 1]; ++adj) + { + int other = groups[adjacency.clusters[adj]].group; + if (other < 0) + continue; + + assert(groups[other].size > 0); + if (groups[id].size + groups[other].size > max_partition_size) + continue; + + 
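`heapPush` and `heapPop` above maintain a plain array-backed binary min-heap keyed on `GroupOrder::order`, so the group with the fewest live vertices is always processed first. As a quick illustration of the invariant they preserve (every parent is no larger than its children, so repeated pops come out in ascending order), here is a minimal standalone sketch of the same sift-up/sift-down logic; the `push`/`pop` names and the test harness are hypothetical, not part of the library:

```cpp
#include <assert.h>
#include <stddef.h>

struct Item
{
	unsigned int id;
	int order;
};

// sift-up insertion, mirroring the heapPush logic above
static void push(Item* heap, size_t& size, Item item)
{
	heap[size++] = item;

	for (size_t i = size - 1; i > 0 && heap[i].order < heap[(i - 1) / 2].order; i = (i - 1) / 2)
	{
		Item tmp = heap[i];
		heap[i] = heap[(i - 1) / 2];
		heap[(i - 1) / 2] = tmp;
	}
}

// sift-down removal of the minimum, mirroring the heapPop logic above
static Item pop(Item* heap, size_t& size)
{
	Item top = heap[0];
	heap[0] = heap[--size];

	for (size_t i = 0; i * 2 + 1 < size;)
	{
		// pick the smaller of the two children
		size_t j = i * 2 + 1;
		j += (j + 1 < size && heap[j + 1].order < heap[j].order);

		if (heap[j].order >= heap[i].order)
			break;

		Item tmp = heap[i];
		heap[i] = heap[j];
		heap[j] = tmp;
		i = j;
	}

	return top;
}

int main()
{
	Item heap[8];
	size_t size = 0;
	int values[] = {5, 1, 4, 2, 3};

	for (size_t i = 0; i < 5; ++i)
		push(heap, size, Item{unsigned(i), values[i]});

	// pops yield orders in ascending sequence: 1, 2, 3, 4, 5
	for (int expected = 1; expected <= 5; ++expected)
		assert(pop(heap, size).order == expected);

	return 0;
}
```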
unsigned int shared = countShared(groups, id, other, adjacency); + float other_rsqrt = 1.f / sqrtf(float(int(groups[other].vertices))); + + // normalize shared count by the expected boundary of each group (+ keeps scoring symmetric) + float score = float(int(shared)) * (group_rsqrt + other_rsqrt); + + if (score > best_score) + { + best_group = other; + best_score = score; + } + } + } + + return best_group; +} + +} // namespace meshopt + +size_t meshopt_partitionClusters(unsigned int* destination, const unsigned int* cluster_indices, size_t total_index_count, const unsigned int* cluster_index_counts, size_t cluster_count, size_t vertex_count, size_t target_partition_size) +{ + using namespace meshopt; + + assert(target_partition_size > 0); + + size_t max_partition_size = target_partition_size + target_partition_size * 3 / 8; + + meshopt_Allocator allocator; + + unsigned char* used = allocator.allocate(vertex_count); + memset(used, 0, vertex_count); + + // build cluster index offsets as a prefix sum + unsigned int* cluster_offsets = allocator.allocate(cluster_count + 1); + unsigned int cluster_nextoffset = 0; + + for (size_t i = 0; i < cluster_count; ++i) + { + assert(cluster_index_counts[i] > 0); + + cluster_offsets[i] = cluster_nextoffset; + cluster_nextoffset += cluster_index_counts[i]; + } + + assert(cluster_nextoffset == total_index_count); + cluster_offsets[cluster_count] = unsigned(total_index_count); + + // build cluster adjacency along with edge weights (shared vertex count) + ClusterAdjacency adjacency = {}; + buildClusterAdjacency(adjacency, cluster_indices, cluster_offsets, cluster_count, used, vertex_count, allocator); + + ClusterGroup* groups = allocator.allocate(cluster_count); + + GroupOrder* order = allocator.allocate(cluster_count); + size_t pending = 0; + + // create a singleton group for each cluster and order them by priority + for (size_t i = 0; i < cluster_count; ++i) + { + groups[i].group = int(i); + groups[i].next = -1; + groups[i].size = 1; + groups[i].vertices = countTotal(groups, int(i), cluster_indices, cluster_offsets, used); + + GroupOrder item = {}; + item.id = unsigned(i); + item.order = groups[i].vertices; + + heapPush(order, pending++, item); + } + + // iteratively merge the smallest group with the best group + while (pending) + { + GroupOrder top = heapPop(order, pending--); + + // this group was merged into another group earlier + if (groups[top.id].size == 0) + continue; + + // disassociate clusters from the group to prevent them from being merged again; we will re-associate them if the group is reinserted + for (int i = top.id; i >= 0; i = groups[i].next) + { + assert(groups[i].group == int(top.id)); + groups[i].group = -1; + } + + // the group is large enough, emit as is + if (groups[top.id].size >= target_partition_size) + continue; + + int best_group = pickGroupToMerge(groups, top.id, adjacency, max_partition_size); + + // we can't grow the group any more, emit as is + if (best_group == -1) + continue; + + // compute shared vertices to adjust the total vertices estimate after merging + unsigned int shared = countShared(groups, top.id, best_group, adjacency); + + // combine groups by linking them together + assert(groups[best_group].size > 0); + + for (int i = top.id; i >= 0; i = groups[i].next) + if (groups[i].next < 0) + { + groups[i].next = best_group; + break; + } + + // update group sizes; note, the vertex update is an approximation which avoids recomputing the true size via countTotal + groups[top.id].size += groups[best_group].size; + 
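The vertex bookkeeping just below is the approximation that the comment above refers to: instead of recounting unique vertices with `countTotal` after every merge, the merged group's vertex count is estimated by inclusion-exclusion as `vertices(A) + vertices(B) - shared(A, B)`, clamped to at least 1 because the shared count can overestimate the true overlap. A tiny standalone sketch of that arithmetic (the function name and harness are hypothetical):

```cpp
#include <assert.h>

// union-size estimate used conceptually by the merge step below:
// |A ∪ B| ~= |A| + |B| - |A ∩ B|, clamped to at least 1
static unsigned int mergedVertexEstimate(unsigned int va, unsigned int vb, unsigned int shared)
{
	unsigned int sum = va + vb;
	return sum > shared ? sum - shared : 1;
}

int main()
{
	// two clusters of 64 vertices that share a 16-vertex boundary
	assert(mergedVertexEstimate(64, 64, 16) == 112);

	// degenerate case: the shared estimate exceeds the sum, so clamp to 1
	assert(mergedVertexEstimate(2, 2, 8) == 1);

	return 0;
}
```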
groups[top.id].vertices += groups[best_group].vertices;
+		groups[top.id].vertices = (groups[top.id].vertices > shared) ? groups[top.id].vertices - shared : 1;
+
+		groups[best_group].size = 0;
+		groups[best_group].vertices = 0;
+
+		// re-associate all clusters back to the merged group
+		for (int i = top.id; i >= 0; i = groups[i].next)
+			groups[i].group = int(top.id);
+
+		top.order = groups[top.id].vertices;
+		heapPush(order, pending++, top);
+	}
+
+	size_t next_group = 0;
+
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		if (groups[i].size == 0)
+			continue;
+
+		for (int j = int(i); j >= 0; j = groups[j].next)
+			destination[j] = unsigned(next_group);
+
+		next_group++;
+	}
+
+	assert(next_group <= cluster_count);
+	return next_group;
+}
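Since `meshopt_partitionClusters` is a new public entry point, a short usage sketch may help review; the toy cluster data below is hypothetical, and a real caller would pass the flattened vertex index lists of its meshlets or LOD clusters:

```cpp
#include <assert.h>

#include <vector>

#include "meshoptimizer.h"

int main()
{
	// three tiny clusters over a 5-vertex mesh, flattened back to back
	const unsigned int cluster_indices[] = {
	    0, 1, 2, // cluster 0
	    1, 2, 3, // cluster 1, shares vertices with cluster 0
	    3, 4, 0, // cluster 2
	};
	const unsigned int cluster_index_counts[] = {3, 3, 3};
	const size_t cluster_count = 3;
	const size_t vertex_count = 5;

	std::vector<unsigned int> partition(cluster_count);

	// groups clusters that share vertices, aiming for ~2 clusters per partition;
	// returns the partition count, and partition[i] receives the partition id of cluster i
	size_t partition_count = meshopt_partitionClusters(partition.data(),
	    cluster_indices, sizeof(cluster_indices) / sizeof(cluster_indices[0]),
	    cluster_index_counts, cluster_count, vertex_count, /* target_partition_size */ 2);

	assert(partition_count >= 1 && partition_count <= cluster_count);
	return 0;
}
```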
quadricError(attribute_quadrics[j0], &attribute_gradients[j0 * attribute_count], attribute_count, vertex_positions[j1], &vertex_attributes[j1 * attribute_count]) : 0;
+
+			// note: seam edges need to aggregate attribute errors between primary and secondary edges, as attribute quadrics are separate
+			if (vertex_kind[i0] == Kind_Seam)
+			{
+				// for seam collapses we need to find the seam pair; this is a bit tricky since we need to rely on edge loops as the target vertex may be locked (and thus have more than two wedges)
+				unsigned int s0 = wedge[i0];
+				unsigned int s1 = loop[i0] == i1 ? loopback[s0] : loop[s0];
+
+				assert(s0 != i0 && wedge[s0] == i0);
+				assert(s1 != ~0u && remap[s1] == remap[i1]);
+
+				// note: the assertion above should make this impossible, but if assertions are disabled and we ever hit this case we would get a memory safety issue; for now play it safe
+				s1 = (s1 != ~0u) ? s1 : wedge[i1];
+
+				ei += quadricError(attribute_quadrics[s0], &attribute_gradients[s0 * attribute_count], attribute_count, vertex_positions[s1], &vertex_attributes[s1 * attribute_count]);
+				ej += c.bidi ? quadricError(attribute_quadrics[s1], &attribute_gradients[s1 * attribute_count], attribute_count, vertex_positions[s0], &vertex_attributes[s0 * attribute_count]) : 0;
+			}
 		}
 
 		// pick edge direction with minimal error
@@ -1206,7 +1227,7 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
 		}
 		else if (kind == Kind_Seam)
 		{
-			// for seam collapses we need to move the seam pair together; this is a bit tricky to compute since we need to rely on edge loops as target vertex may be locked (and thus have more than two wedges)
+			// for seam collapses we need to move the seam pair together; this is a bit tricky since we need to rely on edge loops as the target vertex may be locked (and thus have more than two wedges)
 			unsigned int s0 = wedge[i0];
 			unsigned int s1 = loop[i0] == i1 ? loopback[s0] : loop[s0];
 			assert(s0 != i0 && wedge[s0] == i0);
@@ -1964,7 +1985,7 @@ size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indic
 	printf("pass %d:%c", int(pass_count++), TRACE >= 2 ? 
'\n' : ' ');
 #endif
 
-	rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_attributes, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, remap);
+	rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_attributes, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, remap, wedge, vertex_kind, loop, loopback);
 
 	sortEdgeCollapses(collapse_order, edge_collapses, edge_collapse_count);
diff --git a/thirdparty/meshoptimizer/vertexcodec.cpp b/thirdparty/meshoptimizer/vertexcodec.cpp
index 1dbd2e35f80..53cf9d753c0 100644
--- a/thirdparty/meshoptimizer/vertexcodec.cpp
+++ b/thirdparty/meshoptimizer/vertexcodec.cpp
@@ -60,6 +60,15 @@
 #define SIMD_LATENCYOPT
 #endif
 
+// In switch dispatch, marking the default case as unreachable allows the compiler to remove redundant bounds checks
+#if defined(__GNUC__)
+#define SIMD_UNREACHABLE() __builtin_unreachable()
+#elif defined(_MSC_VER)
+#define SIMD_UNREACHABLE() __assume(false)
+#else
+#define SIMD_UNREACHABLE() assert(!"Unreachable")
+#endif
+
 #endif // !MESHOPTIMIZER_NO_SIMD
 
 #ifdef SIMD_SSE
@@ -114,33 +123,44 @@ namespace meshopt
 const unsigned char kVertexHeader = 0xa0;
 
 static int gEncodeVertexVersion = 0;
+const int kDecodeVertexVersion = 1;
 
 const size_t kVertexBlockSizeBytes = 8192;
 const size_t kVertexBlockMaxSize = 256;
 
 const size_t kByteGroupSize = 16;
 const size_t kByteGroupDecodeLimit = 24;
-const size_t kTailMaxSize = 32;
+const size_t kTailMinSizeV0 = 32;
+const size_t kTailMinSizeV1 = 24;
+
+static const int kBitsV0[4] = {0, 2, 4, 8};
+static const int kBitsV1[5] = {0, 1, 2, 4, 8};
+
+const int kEncodeDefaultLevel = 2;
 
 static size_t getVertexBlockSize(size_t vertex_size)
 {
-	// make sure the entire block fits into the scratch buffer
-	size_t result = kVertexBlockSizeBytes / vertex_size;
-
-	// align to byte group size; we encode each byte as a byte group
-	// if vertex block is misaligned, it results in wasted bytes, so just truncate the block size
-	result &= ~(kByteGroupSize - 1);
+	// make sure the entire block fits into the scratch buffer and is aligned to byte group size
+	// note: the block size is implicitly part of the format, so we can't change it without breaking compatibility
+	size_t result = (kVertexBlockSizeBytes / vertex_size) & ~(kByteGroupSize - 1);
 
 	return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
 }
 
-inline unsigned char zigzag8(unsigned char v)
+inline unsigned int rotate(unsigned int v, int r)
 {
-	return ((signed char)(v) >> 7) ^ (v << 1);
+	return (v << r) | (v >> ((32 - r) & 31));
 }
 
-inline unsigned char unzigzag8(unsigned char v)
+template <typename T>
+inline T zigzag(T v)
 {
-	return -(v & 1) ^ (v >> 1);
+	return (0 - (v >> (sizeof(T) * 8 - 1))) ^ (v << 1);
+}
+
+template <typename T>
+inline T unzigzag(T v)
+{
+	return (0 - (v & 1)) ^ (v >> 1);
 }
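The templated `zigzag`/`unzigzag` pair above generalizes the old byte-wide `zigzag8`/`unzigzag8`: signed deltas are folded so that values near zero in either direction map to small unsigned codes, which is what lets most byte groups get away with very few bits per value. A minimal standalone sketch of the same mapping (the test harness is hypothetical, not part of the library):

```cpp
#include <assert.h>

// same bit trick as the zigzag/unzigzag templates above:
// 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ...
template <typename T>
inline T zigzag(T v)
{
	return (0 - (v >> (sizeof(T) * 8 - 1))) ^ (v << 1);
}

template <typename T>
inline T unzigzag(T v)
{
	return (0 - (v & 1)) ^ (v >> 1);
}

int main()
{
	// a delta of -1 stored as a raw byte would be 0xff (8 significant bits);
	// zigzag folds it to 1, which fits in a single bit
	assert(zigzag<unsigned char>((unsigned char)-1) == 1);
	assert(zigzag<unsigned char>(1) == 2);

	// the mapping round-trips over all byte values
	for (int i = 0; i < 256; ++i)
		assert(unzigzag<unsigned char>(zigzag<unsigned char>((unsigned char)i)) == (unsigned char)i);

	// 16-bit deltas fold the same way, keeping both delta bytes small
	assert(zigzag<unsigned short>((unsigned short)-2) == 3);

	return 0;
}
```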
 
 #if TRACE
@@ -148,8 +168,9 @@ struct Stats
 {
 	size_t size;
 	size_t header;  // bytes for header
-	size_t bitg[4]; // bytes for bit groups
+	size_t bitg[9]; // bytes for bit groups
 	size_t bitc[8]; // bit consistency: how many bits are shared between all bytes in a group
+	size_t ctrl[4]; // number of control groups
 };
 
 static Stats* bytestats = NULL;
@@ -158,18 +179,19 @@ static Stats vertexstats[256];
 
 static bool encodeBytesGroupZero(const unsigned char* buffer)
 {
-	for (size_t i = 0; i < kByteGroupSize; ++i)
-		if (buffer[i])
-			return false;
+	assert(kByteGroupSize == sizeof(unsigned long long) * 2);
 
-	return true;
+	unsigned long long v[2];
+	memcpy(v, buffer, sizeof(v));
+
+	return (v[0] | v[1]) == 0;
 }
 
 static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
 {
-	assert(bits >= 1 && bits <= 8);
+	assert(bits >= 0 && bits <= 8);
 
-	if (bits == 1)
+	if (bits == 0)
 		return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);
 
 	if (bits == 8)
@@ -187,9 +209,10 @@ static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
 
 static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits)
 {
-	assert(bits >= 1 && bits <= 8);
+	assert(bits >= 0 && bits <= 8);
+	assert(kByteGroupSize % 8 == 0);
 
-	if (bits == 1)
+	if (bits == 0)
 		return data;
 
 	if (bits == 8)
@@ -217,21 +240,27 @@ static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char*
 			byte |= enc;
 		}
 
+		// encode 1-bit groups in reverse bit order
+		// this makes them faster to decode alongside other groups
+		if (bits == 1)
+			byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32);
+
 		*data++ = byte;
 	}
 
 	for (size_t i = 0; i < kByteGroupSize; ++i)
 	{
-		if (buffer[i] >= sentinel)
-		{
-			*data++ = buffer[i];
-		}
+		unsigned char v = buffer[i];
+
+		// branchless append of out-of-range values
+		*data = v;
+		data += v >= sentinel;
 	}
 
 	return data;
 }
 
-static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size)
+static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size, const int bits[4])
 {
 	assert(buffer_size % kByteGroupSize == 0);
 
@@ -247,39 +276,40 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end,
 	memset(header, 0, header_size);
 
+	int last_bits = -1;
+
 	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
 	{
 		if (size_t(data_end - data) < kByteGroupDecodeLimit)
 			return NULL;
 
-		int best_bits = 8;
-		size_t best_size = encodeBytesGroupMeasure(buffer + i, 8);
+		int best_bitk = 3;
+		size_t best_size = encodeBytesGroupMeasure(buffer + i, bits[best_bitk]);
 
-		for (int bits = 1; bits < 8; bits *= 2)
+		for (int bitk = 0; bitk < 3; ++bitk)
 		{
-			size_t size = encodeBytesGroupMeasure(buffer + i, bits);
+			size_t size = encodeBytesGroupMeasure(buffer + i, bits[bitk]);
 
-			if (size < best_size)
+			// favor consistent bit selection across groups, but never replace literals
+			if (size < best_size || (size == best_size && bits[bitk] == 
last_bits && bits[best_bitk] != 8)) { - best_bits = bits; + best_bitk = bitk; best_size = size; } } - int bitslog2 = (best_bits == 1) ? 0 : (best_bits == 2 ? 1 : (best_bits == 4 ? 2 : 3)); - assert((1 << bitslog2) == best_bits); - size_t header_offset = i / kByteGroupSize; + header[header_offset / 4] |= best_bitk << ((header_offset % 4) * 2); - header[header_offset / 4] |= bitslog2 << ((header_offset % 4) * 2); - + int best_bits = bits[best_bitk]; unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits); assert(data + best_size == next); data = next; + last_bits = best_bits; #if TRACE - bytestats->bitg[bitslog2] += best_size; + bytestats->bitg[best_bits] += best_size; #endif } @@ -290,51 +320,252 @@ static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, return data; } -static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256]) +template +static void encodeDeltas1(unsigned char* buffer, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[256], size_t k, int rot) +{ + size_t k0 = k & ~(sizeof(T) - 1); + int ks = (k & (sizeof(T) - 1)) * 8; + + T p = last_vertex[k0]; + for (size_t j = 1; j < sizeof(T); ++j) + p |= T(last_vertex[k0 + j]) << (j * 8); + + const unsigned char* vertex = vertex_data + k0; + + for (size_t i = 0; i < vertex_count; ++i) + { + T v = vertex[0]; + for (size_t j = 1; j < sizeof(T); ++j) + v |= vertex[j] << (j * 8); + + T d = Xor ? T(rotate(v ^ p, rot)) : zigzag(T(v - p)); + + buffer[i] = (unsigned char)(d >> ks); + p = v; + vertex += vertex_size; + } +} + +static void encodeDeltas(unsigned char* buffer, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, const unsigned char last_vertex[256], size_t k, int channel) +{ + switch (channel & 3) + { + case 0: + return encodeDeltas1(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0); + case 1: + return encodeDeltas1(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, 0); + case 2: + return encodeDeltas1(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, channel >> 4); + default: + assert(!"Unsupported channel encoding"); // unreachable + } +} + +static int estimateBits(unsigned char v) +{ + return v <= 15 ? (v <= 3 ? (v == 0 ? 0 : 2) : 4) : 8; +} + +static int estimateRotate(const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, size_t k, size_t group_size) +{ + size_t sizes[8] = {}; + + const unsigned char* vertex = vertex_data + k; + unsigned int last = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24); + + for (size_t i = 0; i < vertex_count; i += group_size) + { + unsigned int bitg = 0; + + // calculate bit consistency mask for the group + for (size_t j = 0; j < group_size && i + j < vertex_count; ++j) + { + unsigned int v = vertex[0] | (vertex[1] << 8) | (vertex[2] << 16) | (vertex[3] << 24); + unsigned int d = v ^ last; + + bitg |= d; + last = v; + vertex += vertex_size; + } + +#if TRACE + for (int j = 0; j < 32; ++j) + vertexstats[k + (j / 8)].bitc[j % 8] += (i + group_size < vertex_count ? 
group_size : vertex_count - i) * (1 - ((bitg >> j) & 1)); +#endif + + for (int j = 0; j < 8; ++j) + { + unsigned int bitr = rotate(bitg, j); + + sizes[j] += estimateBits((unsigned char)(bitr >> 0)) + estimateBits((unsigned char)(bitr >> 8)); + sizes[j] += estimateBits((unsigned char)(bitr >> 16)) + estimateBits((unsigned char)(bitr >> 24)); + } + } + + int best_rot = 0; + for (int rot = 1; rot < 8; ++rot) + best_rot = (sizes[rot] < sizes[best_rot]) ? rot : best_rot; + + return best_rot; +} + +static int estimateChannel(const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, size_t k, size_t vertex_block_size, size_t block_skip, int max_channel, int xor_rot) +{ + unsigned char block[kVertexBlockMaxSize]; + assert(vertex_block_size <= kVertexBlockMaxSize); + + unsigned char last_vertex[256] = {}; + + size_t sizes[3] = {}; + assert(max_channel <= 3); + + for (size_t i = 0; i < vertex_count; i += vertex_block_size * block_skip) + { + size_t block_size = i + vertex_block_size < vertex_count ? vertex_block_size : vertex_count - i; + size_t block_size_aligned = (block_size + kByteGroupSize - 1) & ~(kByteGroupSize - 1); + + memcpy(last_vertex, vertex_data + (i == 0 ? 0 : i - 1) * vertex_size, vertex_size); + + // we sometimes encode elements we didn't fill when rounding to kByteGroupSize + if (block_size < block_size_aligned) + memset(block + block_size, 0, block_size_aligned - block_size); + + for (int channel = 0; channel < max_channel; ++channel) + for (size_t j = 0; j < 4; ++j) + { + encodeDeltas(block, vertex_data + i * vertex_size, block_size, vertex_size, last_vertex, k + j, channel | (xor_rot << 4)); + + for (size_t ig = 0; ig < block_size; ig += kByteGroupSize) + { + // to maximize encoding performance we only evaluate 1/2/4/8 bit groups + size_t size1 = encodeBytesGroupMeasure(block + ig, 1); + size_t size2 = encodeBytesGroupMeasure(block + ig, 2); + size_t size4 = encodeBytesGroupMeasure(block + ig, 4); + size_t size8 = encodeBytesGroupMeasure(block + ig, 8); + + size_t best_size = size1 < size2 ? size1 : size2; + best_size = best_size < size4 ? best_size : size4; + best_size = best_size < size8 ? best_size : size8; + + sizes[channel] += best_size; + } + } + } + + int best_channel = 0; + for (int channel = 1; channel < max_channel; ++channel) + best_channel = (sizes[channel] < sizes[best_channel]) ? channel : best_channel; + + return best_channel == 2 ? 
best_channel | (xor_rot << 4) : best_channel; +} + +static bool estimateControlZero(const unsigned char* buffer, size_t vertex_count_aligned) +{ + for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize) + if (!encodeBytesGroupZero(buffer + i)) + return false; + + return true; +} + +static int estimateControl(const unsigned char* buffer, size_t vertex_count, size_t vertex_count_aligned, int level) +{ + if (estimateControlZero(buffer, vertex_count_aligned)) + return 2; // zero encoding + + if (level == 0) + return 1; // 1248 encoding in level 0 for encoding speed + + // round number of groups to 4 to get number of header bytes + size_t header_size = (vertex_count_aligned / kByteGroupSize + 3) / 4; + + size_t est_bytes0 = header_size, est_bytes1 = header_size; + + for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize) + { + // assumes kBitsV1[] = {0, 1, 2, 4, 8} for performance + size_t size0 = encodeBytesGroupMeasure(buffer + i, 0); + size_t size1 = encodeBytesGroupMeasure(buffer + i, 1); + size_t size2 = encodeBytesGroupMeasure(buffer + i, 2); + size_t size4 = encodeBytesGroupMeasure(buffer + i, 4); + size_t size8 = encodeBytesGroupMeasure(buffer + i, 8); + + // both control modes have access to 1/2/4 bit encoding + size_t size12 = size1 < size2 ? size1 : size2; + size_t size124 = size12 < size4 ? size12 : size4; + + // each control mode has access to 0/8 bit encoding respectively + est_bytes0 += size124 < size0 ? size124 : size0; + est_bytes1 += size124 < size8 ? size124 : size8; + } + + // pick shortest control entry but prefer literal encoding + if (est_bytes0 < vertex_count || est_bytes1 < vertex_count) + return est_bytes0 < est_bytes1 ? 0 : 1; + else + return 3; // literal encoding +} + +static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version, int level) { assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize); + assert(vertex_size % 4 == 0); unsigned char buffer[kVertexBlockMaxSize]; assert(sizeof(buffer) % kByteGroupSize == 0); + size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1); + // we sometimes encode elements we didn't fill when rounding to kByteGroupSize memset(buffer, 0, sizeof(buffer)); + size_t control_size = version == 0 ? 0 : vertex_size / 4; + if (size_t(data_end - data) < control_size) + return NULL; + + unsigned char* control = data; + data += control_size; + + memset(control, 0, control_size); + for (size_t k = 0; k < vertex_size; ++k) { - size_t vertex_offset = k; - - unsigned char p = last_vertex[k]; - - for (size_t i = 0; i < vertex_count; ++i) - { - buffer[i] = zigzag8(vertex_data[vertex_offset] - p); - - p = vertex_data[vertex_offset]; - - vertex_offset += vertex_size; - } + encodeDeltas(buffer, vertex_data, vertex_count, vertex_size, last_vertex, k, version == 0 ? 0 : channels[k / 4]); #if TRACE const unsigned char* olddata = data; bytestats = &vertexstats[k]; - - for (size_t ig = 0; ig < vertex_count; ig += kByteGroupSize) - { - unsigned char last = (ig == 0) ? last_vertex[k] : vertex_data[vertex_size * (ig - 1) + k]; - unsigned char delta = 0xff; - - for (size_t i = ig; i < ig + kByteGroupSize && i < vertex_count; ++i) - delta &= ~(vertex_data[vertex_size * i + k] ^ last); - - for (int j = 0; j < 8; ++j) - bytestats->bitc[j] += (vertex_count - ig < kByteGroupSize ? 
vertex_count - ig : kByteGroupSize) * ((delta >> j) & 1); - } #endif - data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1)); - if (!data) - return NULL; + int ctrl = 0; + + if (version != 0) + { + ctrl = estimateControl(buffer, vertex_count, vertex_count_aligned, level); + + assert(unsigned(ctrl) < 4); + control[k / 4] |= ctrl << ((k % 4) * 2); + +#if TRACE + vertexstats[k].ctrl[ctrl]++; +#endif + } + + if (ctrl == 3) + { + // literal encoding + if (size_t(data_end - data) < vertex_count) + return NULL; + + memcpy(data, buffer, vertex_count); + data += vertex_count; + } + else if (ctrl != 2) // non-zero encoding + { + data = encodeBytes(data, data_end, buffer, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl); + if (!data) + return NULL; + } #if TRACE bytestats = NULL; @@ -348,7 +579,7 @@ static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data } #if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX) && !defined(SIMD_WASM)) -static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2) +static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bits) { #define READ() byte = *data++ #define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? encv : enc, data_var += (enc == (1 << bits) - 1) @@ -356,12 +587,24 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned unsigned char byte, enc, encv; const unsigned char* data_var; - switch (bitslog2) + switch (bits) { case 0: memset(buffer, 0, kByteGroupSize); return data; case 1: + data_var = data + 2; + + // 2 groups with 8 1-bit values in each byte (reversed from the order in other groups) + READ(); + byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32); + NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1); + READ(); + byte = (unsigned char)(((byte * 0x80200802ull) & 0x0884422110ull) * 0x0101010101ull >> 32); + NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1), NEXT(1); + + return data_var; + case 2: data_var = data + 4; // 4 groups with 4 2-bit values in each byte @@ -371,7 +614,7 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2); return data_var; - case 2: + case 4: data_var = data + 8; // 8 groups with 2 4-bit values in each byte @@ -385,11 +628,11 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned READ(), NEXT(4), NEXT(4); return data_var; - case 3: + case 8: memcpy(buffer, data, kByteGroupSize); return data + kByteGroupSize; default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value + assert(!"Unexpected bit length"); // unreachable return data; } @@ -397,18 +640,16 @@ static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned #undef NEXT } -static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size) +static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size, const int* bits) { assert(buffer_size % kByteGroupSize == 0); - const unsigned char* header = data; - // round number of groups to 4 to get number of header bytes size_t header_size = (buffer_size / 
kByteGroupSize + 3) / 4; - if (size_t(data_end - data) < header_size) return NULL; + const unsigned char* header = data; data += header_size; for (size_t i = 0; i < buffer_size; i += kByteGroupSize) @@ -417,43 +658,108 @@ static const unsigned char* decodeBytes(const unsigned char* data, const unsigne return NULL; size_t header_offset = i / kByteGroupSize; + int bitsk = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3; - int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3; - - data = decodeBytesGroup(data, buffer + i, bitslog2); + data = decodeBytesGroup(data, buffer + i, bits[bitsk]); } return data; } -static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256]) +template +static void decodeDeltas1(const unsigned char* buffer, unsigned char* transposed, size_t vertex_count, size_t vertex_size, const unsigned char* last_vertex, int rot) +{ + for (size_t k = 0; k < 4; k += sizeof(T)) + { + size_t vertex_offset = k; + + T p = last_vertex[0]; + for (size_t j = 1; j < sizeof(T); ++j) + p |= last_vertex[j] << (8 * j); + + for (size_t i = 0; i < vertex_count; ++i) + { + T v = buffer[i]; + for (size_t j = 1; j < sizeof(T); ++j) + v |= buffer[i + vertex_count * j] << (8 * j); + + v = Xor ? T(rotate(v, rot)) ^ p : unzigzag(v) + p; + + for (size_t j = 0; j < sizeof(T); ++j) + transposed[vertex_offset + j] = (unsigned char)(v >> (j * 8)); + + p = v; + + vertex_offset += vertex_size; + } + + buffer += vertex_count * sizeof(T); + last_vertex += sizeof(T); + } +} + +static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version) { assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize); - unsigned char buffer[kVertexBlockMaxSize]; + unsigned char buffer[kVertexBlockMaxSize * 4]; unsigned char transposed[kVertexBlockSizeBytes]; size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1); assert(vertex_count <= vertex_count_aligned); - for (size_t k = 0; k < vertex_size; ++k) + size_t control_size = version == 0 ? 0 : vertex_size / 4; + if (size_t(data_end - data) < control_size) + return NULL; + + const unsigned char* control = data; + data += control_size; + + for (size_t k = 0; k < vertex_size; k += 4) { - data = decodeBytes(data, data_end, buffer, vertex_count_aligned); - if (!data) - return NULL; + unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4]; - size_t vertex_offset = k; - - unsigned char p = last_vertex[k]; - - for (size_t i = 0; i < vertex_count; ++i) + for (size_t j = 0; j < 4; ++j) { - unsigned char v = unzigzag8(buffer[i]) + p; + int ctrl = (ctrl_byte >> (j * 2)) & 3; - transposed[vertex_offset] = v; - p = v; + if (ctrl == 3) + { + // literal encoding + if (size_t(data_end - data) < vertex_count) + return NULL; - vertex_offset += vertex_size; + memcpy(buffer + j * vertex_count, data, vertex_count); + data += vertex_count; + } + else if (ctrl == 2) + { + // zero encoding + memset(buffer + j * vertex_count, 0, vertex_count); + } + else + { + data = decodeBytes(data, data_end, buffer + j * vertex_count, vertex_count_aligned, version == 0 ? kBitsV0 : kBitsV1 + ctrl); + if (!data) + return NULL; + } + } + + int channel = version == 0 ? 
0 : channels[k / 4]; + + switch (channel & 3) + { + case 0: + decodeDeltas1(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, 0); + break; + case 1: + decodeDeltas1(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, 0); + break; + case 2: + decodeDeltas1(buffer, transposed + k, vertex_count, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31); + break; + default: + return NULL; // invalid channel type } } @@ -499,7 +805,7 @@ static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables(); #ifdef SIMD_SSE SIMD_TARGET -static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1) +inline __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1) { __m128i sm0 = _mm_loadl_epi64(reinterpret_cast(&kDecodeBytesGroupShuffle[mask0])); __m128i sm1 = _mm_loadl_epi64(reinterpret_cast(&kDecodeBytesGroupShuffle[mask1])); @@ -511,11 +817,12 @@ static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1) } SIMD_TARGET -static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits) { - switch (bitslog2) + switch (hbits) { case 0: + case 4: { __m128i result = _mm_setzero_si128(); @@ -525,6 +832,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 1: + case 6: { #ifdef __GNUC__ typedef int __attribute__((aligned(1))) unaligned_int; @@ -557,7 +865,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi unsigned char mask1 = (unsigned char)(mask16 >> 8); __m128i shuf = decodeShuffleMask(mask0, mask1); - __m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel)); _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); @@ -570,6 +877,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 2: + case 7: { #ifdef SIMD_LATENCYOPT unsigned long long data64; @@ -593,7 +901,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi unsigned char mask1 = (unsigned char)(mask16 >> 8); __m128i shuf = decodeShuffleMask(mask0, mask1); - __m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel)); _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); @@ -606,6 +913,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 3: + case 8: { __m128i result = _mm_loadu_si128(reinterpret_cast(data)); @@ -614,26 +922,46 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi return data + 16; } + case 5: + { + __m128i rest = _mm_loadu_si128(reinterpret_cast(data + 2)); + + unsigned char mask0 = data[0]; + unsigned char mask1 = data[1]; + + __m128i shuf = decodeShuffleMask(mask0, mask1); + __m128i result = _mm_shuffle_epi8(rest, shuf); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); + + return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; + } + default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value - return data; + SIMD_UNREACHABLE(); // unreachable } } #endif #ifdef SIMD_AVX -static const __m128i decodeBytesGroupConfig[] = { - _mm_set1_epi8(3), - _mm_set1_epi8(15), - _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24), - _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56), +static const __m128i 
kDecodeBytesGroupConfig[8][2] = { + {_mm_setzero_si128(), _mm_setzero_si128()}, + {_mm_set1_epi8(3), _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24)}, + {_mm_set1_epi8(15), _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56)}, + {_mm_setzero_si128(), _mm_setzero_si128()}, + {_mm_setzero_si128(), _mm_setzero_si128()}, + {_mm_set1_epi8(1), _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)}, + {_mm_set1_epi8(3), _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24)}, + {_mm_set1_epi8(15), _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56)}, }; -static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +SIMD_TARGET +inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits) { - switch (bitslog2) + switch (hbits) { case 0: + case 4: { __m128i result = _mm_setzero_si128(); @@ -642,16 +970,19 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi return data; } - case 1: - case 2: + case 5: // 1-bit + case 1: // 2-bit + case 6: + case 2: // 4-bit + case 7: { - const unsigned char* skip = data + (bitslog2 << 2); + const unsigned char* skip = data + (2 << (hbits < 3 ? hbits : hbits - 5)); __m128i selb = _mm_loadl_epi64(reinterpret_cast(data)); __m128i rest = _mm_loadu_si128(reinterpret_cast(skip)); - __m128i sent = decodeBytesGroupConfig[bitslog2 - 1]; - __m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1]; + __m128i sent = kDecodeBytesGroupConfig[hbits][0]; + __m128i ctrl = kDecodeBytesGroupConfig[hbits][1]; __m128i selw = _mm_shuffle_epi32(selb, 0x44); __m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw)); @@ -665,6 +996,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 3: + case 8: { __m128i result = _mm_loadu_si128(reinterpret_cast(data)); @@ -674,14 +1006,14 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value - return data; + SIMD_UNREACHABLE(); // unreachable } } #endif #ifdef SIMD_NEON -static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1) +SIMD_TARGET +inline uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1) { uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]); uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]); @@ -692,7 +1024,8 @@ static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8 return vcombine_u8(r0, r1); } -static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1) +SIMD_TARGET +inline void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1) { // magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00 const uint64_t magic = 0x000103070f1f3f80ull; @@ -703,11 +1036,13 @@ static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& m mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56); } -static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +SIMD_TARGET +inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits) { - switch (bitslog2) + switch (hbits) { case 0: + case 4: { uint8x16_t result = vdupq_n_u8(0); @@ -717,6 
+1052,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 1: + case 6: { #ifdef SIMD_LATENCYOPT unsigned int data32; @@ -754,6 +1090,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 2: + case 7: { #ifdef SIMD_LATENCYOPT unsigned long long data64; @@ -788,6 +1125,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 3: + case 8: { uint8x16_t result = vld1q_u8(data); @@ -796,30 +1134,42 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi return data + 16; } + case 5: + { + unsigned char mask0 = data[0]; + unsigned char mask1 = data[1]; + + uint8x8_t rest0 = vld1_u8(data + 2); + uint8x8_t rest1 = vld1_u8(data + 2 + kDecodeBytesGroupCount[mask0]); + + uint8x16_t result = shuffleBytes(mask0, mask1, rest0, rest1); + + vst1q_u8(buffer, result); + + return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; + } + default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value - return data; + SIMD_UNREACHABLE(); // unreachable } } #endif #ifdef SIMD_WASM SIMD_TARGET -static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1) +inline v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1) { v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]); v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]); - v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]); - sm1off = wasm_i8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - + v128_t sm1off = wasm_v128_load8_splat(&kDecodeBytesGroupCount[mask0]); v128_t sm1r = wasm_i8x16_add(sm1, sm1off); return wasmx_unpacklo_v64x2(sm0, sm1r); } SIMD_TARGET -static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1) +inline void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1) { // magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00 const uint64_t magic = 0x000103070f1f3f80ull; @@ -829,11 +1179,12 @@ static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1 } SIMD_TARGET -static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +inline const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int hbits) { - switch (bitslog2) + switch (hbits) { case 0: + case 4: { v128_t result = wasm_i8x16_splat(0); @@ -843,6 +1194,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 1: + case 6: { v128_t sel2 = wasm_v128_load(data); v128_t rest = wasm_v128_load(data + 4); @@ -857,7 +1209,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi wasmMoveMask(mask, mask0, mask1); v128_t shuf = decodeShuffleMask(mask0, mask1); - v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask); wasm_v128_store(buffer, result); @@ -866,6 +1217,7 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 2: + case 7: { v128_t sel4 = wasm_v128_load(data); v128_t rest = wasm_v128_load(data + 8); @@ -879,7 +1231,6 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi wasmMoveMask(mask, mask0, mask1); v128_t shuf = decodeShuffleMask(mask0, mask1); - v128_t result = wasm_v128_bitselect(wasm_i8x16_swizzle(rest, shuf), sel, mask); wasm_v128_store(buffer, result); @@ -888,6 +1239,7 @@ static const 
unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi } case 3: + case 8: { v128_t result = wasm_v128_load(data); @@ -896,16 +1248,30 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi return data + 16; } + case 5: + { + v128_t rest = wasm_v128_load(data + 2); + + unsigned char mask0 = data[0]; + unsigned char mask1 = data[1]; + + v128_t shuf = decodeShuffleMask(mask0, mask1); + v128_t result = wasm_i8x16_swizzle(rest, shuf); + + wasm_v128_store(buffer, result); + + return data + 2 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; + } + default: - assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value - return data; + SIMD_UNREACHABLE(); // unreachable } } #endif #if defined(SIMD_SSE) || defined(SIMD_AVX) SIMD_TARGET -static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3) +inline void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3) { __m128i t0 = _mm_unpacklo_epi8(x0, x1); __m128i t1 = _mm_unpackhi_epi8(x0, x1); @@ -919,17 +1285,33 @@ static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3) } SIMD_TARGET -static __m128i unzigzag8(__m128i v) +inline __m128i unzigzag8(__m128i v) { __m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1))); __m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127)); return _mm_xor_si128(xl, xr); } + +SIMD_TARGET +inline __m128i unzigzag16(__m128i v) +{ + __m128i xl = _mm_sub_epi16(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi16(1))); + __m128i xr = _mm_srli_epi16(v, 1); + + return _mm_xor_si128(xl, xr); +} + +SIMD_TARGET +inline __m128i rotate32(__m128i v, int r) +{ + return _mm_or_si128(_mm_slli_epi32(v, r), _mm_srli_epi32(v, 32 - r)); +} #endif #ifdef SIMD_NEON -static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3) +SIMD_TARGET +inline void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3) { uint8x16x2_t t01 = vzipq_u8(x0, x1); uint8x16x2_t t23 = vzipq_u8(x2, x3); @@ -943,18 +1325,64 @@ static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_ x3 = vreinterpretq_u8_u16(x23.val[1]); } -static uint8x16_t unzigzag8(uint8x16_t v) +SIMD_TARGET +inline uint8x16_t unzigzag8(uint8x16_t v) { uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1))))); uint8x16_t xr = vshrq_n_u8(v, 1); return veorq_u8(xl, xr); } + +SIMD_TARGET +inline uint8x16_t unzigzag16(uint8x16_t v) +{ + uint16x8_t vv = vreinterpretq_u16_u8(v); + uint8x16_t xl = vreinterpretq_u8_s16(vnegq_s16(vreinterpretq_s16_u16(vandq_u16(vv, vdupq_n_u16(1))))); + uint8x16_t xr = vreinterpretq_u8_u16(vshrq_n_u16(vv, 1)); + + return veorq_u8(xl, xr); +} + +SIMD_TARGET +inline uint8x16_t rotate32(uint8x16_t v, int r) +{ + uint32x4_t v32 = vreinterpretq_u32_u8(v); + return vreinterpretq_u8_u32(vorrq_u32(vshlq_u32(v32, vdupq_n_s32(r)), vshlq_u32(v32, vdupq_n_s32(r - 32)))); +} + +template +SIMD_TARGET inline uint8x8_t rebase(uint8x8_t npi, uint8x16_t r0, uint8x16_t r1, uint8x16_t r2, uint8x16_t r3) +{ + switch (Channel) + { + case 0: + { + uint8x16_t rsum = vaddq_u8(vaddq_u8(r0, r1), vaddq_u8(r2, r3)); + uint8x8_t rsumx = vadd_u8(vget_low_u8(rsum), vget_high_u8(rsum)); + return vadd_u8(vadd_u8(npi, rsumx), vext_u8(rsumx, rsumx, 4)); + } + case 1: + { + uint16x8_t rsum = vaddq_u16(vaddq_u16(vreinterpretq_u16_u8(r0), vreinterpretq_u16_u8(r1)), vaddq_u16(vreinterpretq_u16_u8(r2), vreinterpretq_u16_u8(r3))); + uint16x4_t rsumx = 
vadd_u16(vget_low_u16(rsum), vget_high_u16(rsum));
+		return vreinterpret_u8_u16(vadd_u16(vadd_u16(vreinterpret_u16_u8(npi), rsumx), vext_u16(rsumx, rsumx, 2)));
+	}
+	case 2:
+	{
+		uint8x16_t rsum = veorq_u8(veorq_u8(r0, r1), veorq_u8(r2, r3));
+		uint8x8_t rsumx = veor_u8(vget_low_u8(rsum), vget_high_u8(rsum));
+		return veor_u8(veor_u8(npi, rsumx), vext_u8(rsumx, rsumx, 4));
+	}
+	default:
+		return npi;
+	}
+}
 #endif
 
 #ifdef SIMD_WASM
 SIMD_TARGET
-static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
+inline void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
 {
 	v128_t t0 = wasmx_unpacklo_v8x16(x0, x1);
 	v128_t t1 = wasmx_unpackhi_v8x16(x0, x1);
@@ -968,44 +1396,57 @@ static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
 }
 
 SIMD_TARGET
-static v128_t unzigzag8(v128_t v)
+inline v128_t unzigzag8(v128_t v)
 {
 	v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1)));
 	v128_t xr = wasm_u8x16_shr(v, 1);
 
 	return wasm_v128_xor(xl, xr);
 }
+
+SIMD_TARGET
+inline v128_t unzigzag16(v128_t v)
+{
+	v128_t xl = wasm_i16x8_neg(wasm_v128_and(v, wasm_i16x8_splat(1)));
+	v128_t xr = wasm_u16x8_shr(v, 1);
+
+	return wasm_v128_xor(xl, xr);
+}
+
+SIMD_TARGET
+inline v128_t rotate32(v128_t v, int r)
+{
+	return wasm_v128_or(wasm_i32x4_shl(v, r), wasm_i32x4_shr(v, 32 - r));
+}
 #endif
 
 #if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
 SIMD_TARGET
-static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
+static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size, int hshift)
 {
 	assert(buffer_size % kByteGroupSize == 0);
 	assert(kByteGroupSize == 16);
 
-	const unsigned char* header = data;
-
 	// round number of groups to 4 to get number of header bytes
 	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
-
 	if (size_t(data_end - data) < header_size)
 		return NULL;
 
+	const unsigned char* header = data;
 	data += header_size;
 
 	size_t i = 0;
 
-	// fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=24b
+	// fast-path: process 4 groups at a time, do a shared bounds check
 	for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4)
 	{
 		size_t header_offset = i / kByteGroupSize;
 		unsigned char header_byte = header[header_offset / 4];
 
-		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, (header_byte >> 0) & 3);
-		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, (header_byte >> 2) & 3);
-		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, (header_byte >> 4) & 3);
-		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, (header_byte >> 6) & 3);
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, hshift + ((header_byte >> 0) & 3));
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, hshift + ((header_byte >> 2) & 3));
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, hshift + ((header_byte >> 4) & 3));
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, hshift + ((header_byte >> 6) & 3));
	}
 
 	// slow-path: process remaining groups
@@ -1015,17 +1456,102 @@ static const unsigned char* decodeBytesSimd(const unsigned char* data, const uns
 			return NULL;
 
 		size_t header_offset = i / kByteGroupSize;
+		unsigned char header_byte = header[header_offset / 4];
 
-		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;
-
-		data = decodeBytesGroupSimd(data, buffer + i, bitslog2);
+		data = decodeBytesGroupSimd(data, buffer + i, hshift + ((header_byte >> ((header_offset % 4) * 2)) & 3));
 	}
 
 	return data;
 }
 
+template <int Channel>
+SIMD_TARGET static void
+decodeDeltas4Simd(const unsigned char* buffer, unsigned char* transposed, size_t vertex_count_aligned, size_t vertex_size, unsigned char last_vertex[4], int rot)
+{
+#if defined(SIMD_SSE) || defined(SIMD_AVX)
+#define TEMP __m128i
+#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex))
+#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
+#define GRP4(i) t0 = r##i, t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
+#define FIXD(i) t##i = pi = Channel == 0 ? _mm_add_epi8(pi, t##i) : (Channel == 1 ? _mm_add_epi16(pi, t##i) : _mm_xor_si128(pi, t##i))
+#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
+#endif
+
+#ifdef SIMD_NEON
+#define TEMP uint8x8_t
+#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<const uint32_t*>(last_vertex), vdup_n_u32(0), 0))
+#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
+#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
+#define FIXD(i) t##i = pi = Channel == 0 ? vadd_u8(pi, t##i) : (Channel == 1 ? vreinterpret_u8_u16(vadd_u16(vreinterpret_u16_u8(pi), vreinterpret_u16_u8(t##i))) : veor_u8(pi, t##i))
+#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
+#endif
+
+#ifdef SIMD_WASM
+#define TEMP v128_t
+#define PREP() v128_t pi = wasm_v128_load(last_vertex)
+#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
+#define GRP4(i) t0 = r##i, t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
+#define FIXD(i) t##i = pi = Channel == 0 ? wasm_i8x16_add(pi, t##i) : (Channel == 1 ? wasm_i16x8_add(pi, t##i) : wasm_v128_xor(pi, t##i))
+#define SAVE(i) wasm_v128_store32_lane(savep, t##i, 0), savep += vertex_size
+#endif
+
+#define UNZR(i) r##i = Channel == 0 ? unzigzag8(r##i) : (Channel == 1 ? unzigzag16(r##i) : rotate32(r##i, rot))
+
+	PREP();
+
+	unsigned char* savep = transposed;
+
+	for (size_t j = 0; j < vertex_count_aligned; j += 16)
+	{
+		LOAD(0);
+		LOAD(1);
+		LOAD(2);
+		LOAD(3);
+
+		transpose8(r0, r1, r2, r3);
+
+		TEMP t0, t1, t2, t3;
+		TEMP npi = pi;
+
+		UNZR(0);
+		GRP4(0);
+		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+		UNZR(1);
+		GRP4(1);
+		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+		UNZR(2);
+		GRP4(2);
+		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+		UNZR(3);
+		GRP4(3);
+		FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+		SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+#if defined(SIMD_LATENCYOPT) && defined(SIMD_NEON) && (defined(__APPLE__) || defined(_WIN32))
+		// instead of relying on accumulated pi, recompute it from scratch from r0..r3; this shortens dependency between loop iterations
+		pi = rebase<Channel>(npi, r0, r1, r2, r3);
+#else
+		(void)npi;
+#endif
+
+#undef UNZR
+#undef TEMP
+#undef PREP
+#undef LOAD
+#undef GRP4
+#undef FIXD
+#undef SAVE
+	}
+}
+
 SIMD_TARGET
-static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256], const unsigned char* channels, int version)
 {
 	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
 
@@ -1034,84 +1560,61 @@ static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, con
 	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
 
+	size_t control_size = version == 0 ? 0 : vertex_size / 4;
+	if (size_t(data_end - data) < control_size)
+		return NULL;
+
+	const unsigned char* control = data;
+	data += control_size;
+
 	for (size_t k = 0; k < vertex_size; k += 4)
 	{
+		unsigned char ctrl_byte = version == 0 ? 0 : control[k / 4];
+
 		for (size_t j = 0; j < 4; ++j)
 		{
-			data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned);
-			if (!data)
-				return NULL;
+			int ctrl = (ctrl_byte >> (j * 2)) & 3;
+
+			if (ctrl == 3)
+			{
+				// literal encoding; safe to over-copy due to tail
+				if (size_t(data_end - data) < vertex_count_aligned)
+					return NULL;
+
+				memcpy(buffer + j * vertex_count_aligned, data, vertex_count_aligned);
+				data += vertex_count;
+			}
+			else if (ctrl == 2)
+			{
+				// zero encoding
+				memset(buffer + j * vertex_count_aligned, 0, vertex_count_aligned);
+			}
+			else
+			{
+				// for v0, headers are mapped to 0..3; for v1, headers are mapped to 4..8
+				int hshift = version == 0 ? 0 : 4 + ctrl;
+
+				data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned, hshift);
+				if (!data)
+					return NULL;
+			}
 		}
 
-#if defined(SIMD_SSE) || defined(SIMD_AVX)
-#define TEMP __m128i
-#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex + k))
-#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
-#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
-#define FIXD(i) t##i = pi = _mm_add_epi8(pi, t##i)
-#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
-#endif
+		int channel = version == 0 ? 0 : channels[k / 4];
 
-#ifdef SIMD_NEON
-#define TEMP uint8x8_t
-#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<const uint32_t*>(last_vertex + k), vdup_n_u32(0), 0))
-#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
-#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
-#define FIXD(i) t##i = pi = vadd_u8(pi, t##i)
-#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
-#endif
-
-#ifdef SIMD_WASM
-#define TEMP v128_t
-#define PREP() v128_t pi = wasm_v128_load(last_vertex + k)
-#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
-#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
-#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i)
-#define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size
-#endif
-
-		PREP();
-
-		unsigned char* savep = transposed + k;
-
-		for (size_t j = 0; j < vertex_count_aligned; j += 16)
+		switch (channel & 3)
 		{
-			LOAD(0);
-			LOAD(1);
-			LOAD(2);
-			LOAD(3);
-
-			r0 = unzigzag8(r0);
-			r1 = unzigzag8(r1);
-			r2 = unzigzag8(r2);
-			r3 = unzigzag8(r3);
-
-			transpose8(r0, r1, r2, r3);
-
-			TEMP t0, t1, t2, t3;
-
-			GRP4(0);
-			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
-			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
-
-			GRP4(1);
-			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
-			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
-
-			GRP4(2);
-			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
-			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
-
-			GRP4(3);
-			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
-			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
-
-#undef TEMP
-#undef PREP
-#undef LOAD
-#undef GRP4
-#undef FIXD
-#undef SAVE
+		case 0:
+			decodeDeltas4Simd<0>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, 0);
+			break;
+		case 1:
+			decodeDeltas4Simd<1>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, 0);
+			break;
+		case 2:
+			decodeDeltas4Simd<2>(buffer, transposed + k, vertex_count_aligned, vertex_size, last_vertex + k, (32 - (channel >> 4)) & 31);
+			break;
+		default:
+			return NULL; // invalid channel type
 		}
 	}
 
@@ -1140,12 +1643,13 @@ static unsigned int cpuid = getCpuFeatures();
 
 } // namespace meshopt
 
-size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
+size_t meshopt_encodeVertexBufferLevel(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size, int level)
 {
 	using namespace meshopt;
 
 	assert(vertex_size > 0 && vertex_size <= 256);
 	assert(vertex_size % 4 == 0);
+	assert(level >= 0 && level <= 9); // only a subset of this range is used right now
 
 #if TRACE
 	memset(vertexstats, 0, sizeof(vertexstats));
@@ -1156,7 +1660,7 @@ size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, con
 	unsigned char* data = buffer;
 	unsigned char* data_end = buffer + buffer_size;
 
-	if (size_t(data_end - data) < 1 + vertex_size)
+	if (size_t(data_end - data) < 1)
 		return 0;
 
 	int version = gEncodeVertexVersion;
@@ -1172,34 +1676,52 @@ size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, con
 
 	size_t vertex_block_size = getVertexBlockSize(vertex_size);
 
+	unsigned char channels[64] = {};
+	if (version != 0 && level > 1 && vertex_count > 1)
+		for (size_t k = 0; k < vertex_size; k += 4)
+		{
+			int rot = level >= 3 ? estimateRotate(vertex_data, vertex_count, vertex_size, k, /* group_size= */ 16) : 0;
+			int channel = estimateChannel(vertex_data, vertex_count, vertex_size, k, vertex_block_size, /* block_skip= */ 3, /* max_channels= */ level >= 3 ? 3 : 2, rot);
+
+			assert(unsigned(channel) < 2 || ((channel & 3) == 2 && unsigned(channel >> 4) < 8));
+			channels[k / 4] = (unsigned char)channel;
+		}
+
 	size_t vertex_offset = 0;
 
 	while (vertex_offset < vertex_count)
 	{
 		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
 
-		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
+		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version, level);
 		if (!data)
 			return 0;
 
 		vertex_offset += block_size;
 	}
 
-	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
+	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
+	size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1;
+	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
 
-	if (size_t(data_end - data) < tail_size)
+	if (size_t(data_end - data) < tail_size_pad)
 		return 0;
 
-	// write first vertex to the end of the stream and pad it to 32 bytes; this is important to simplify bounds checks in decoder
-	if (vertex_size < kTailMaxSize)
+	if (tail_size < tail_size_pad)
 	{
-		memset(data, 0, kTailMaxSize - vertex_size);
-		data += kTailMaxSize - vertex_size;
+		memset(data, 0, tail_size_pad - tail_size);
+		data += tail_size_pad - tail_size;
 	}
 
 	memcpy(data, first_vertex, vertex_size);
 	data += vertex_size;
 
+	if (version != 0)
+	{
+		memcpy(data, channels, vertex_size / 4);
+		data += vertex_size / 4;
+	}
+
 	assert(data >= buffer + tail_size);
 	assert(data <= buffer + buffer_size);
 
@@ -1212,17 +1734,40 @@ size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, con
 		printf("%2d: %7d bytes [%4.1f%%] %.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);
 
-		size_t total_k = vsk.header + vsk.bitg[0] + vsk.bitg[1] + vsk.bitg[2] + vsk.bitg[3];
+		size_t total_k = vsk.header + vsk.bitg[1] + vsk.bitg[2] + vsk.bitg[4] + vsk.bitg[8];
+		double total_kr = total_k ? 1.0 / double(total_k) : 0;
 
-		printf(" |\thdr [%5.1f%%] bitg 1-3 [%4.1f%% %4.1f%% %4.1f%%]",
-		    double(vsk.header) / double(total_k) * 100, double(vsk.bitg[1]) / double(total_k) * 100,
-		    double(vsk.bitg[2]) / double(total_k) * 100, double(vsk.bitg[3]) / double(total_k) * 100);
+		if (version != 0)
+		{
+			int channel = channels[k / 4];
+
+			if ((channel & 3) == 2 && k % 4 == 0)
+				printf(" | ^%d", channel >> 4);
+			else
+				printf(" | %2s", channel == 0 ? "1" : (channel == 1 && k % 2 == 0 ? "2" : "."));
+		}
+
+		printf(" | hdr [%5.1f%%] bitg [1 %4.1f%% 2 %4.1f%% 4 %4.1f%% 8 %4.1f%%]",
+		    double(vsk.header) * total_kr * 100,
+		    double(vsk.bitg[1]) * total_kr * 100, double(vsk.bitg[2]) * total_kr * 100,
+		    double(vsk.bitg[4]) * total_kr * 100, double(vsk.bitg[8]) * total_kr * 100);
+
+		size_t total_ctrl = vsk.ctrl[0] + vsk.ctrl[1] + vsk.ctrl[2] + vsk.ctrl[3];
+
+		if (total_ctrl)
+		{
+			printf(" | ctrl %3.0f%% %3.0f%% %3.0f%% %3.0f%%",
+			    double(vsk.ctrl[0]) / double(total_ctrl) * 100, double(vsk.ctrl[1]) / double(total_ctrl) * 100,
+			    double(vsk.ctrl[2]) / double(total_ctrl) * 100, double(vsk.ctrl[3]) / double(total_ctrl) * 100);
+		}
+
+		if (level >= 3)
+			printf(" | bitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]",
+			    double(vsk.bitc[0]) / double(vertex_count) * 100, double(vsk.bitc[1]) / double(vertex_count) * 100,
+			    double(vsk.bitc[2]) / double(vertex_count) * 100, double(vsk.bitc[3]) / double(vertex_count) * 100,
+			    double(vsk.bitc[4]) / double(vertex_count) * 100, double(vsk.bitc[5]) / double(vertex_count) * 100,
+			    double(vsk.bitc[6]) / double(vertex_count) * 100, double(vsk.bitc[7]) / double(vertex_count) * 100);
 
-		printf(" |\tbitc [%3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%% %3.0f%%]",
-		    double(vsk.bitc[0]) / double(vertex_count) * 100, double(vsk.bitc[1]) / double(vertex_count) * 100,
-		    double(vsk.bitc[2]) / double(vertex_count) * 100, double(vsk.bitc[3]) / double(vertex_count) * 100,
-		    double(vsk.bitc[4]) / double(vertex_count) * 100, double(vsk.bitc[5]) / double(vertex_count) * 100,
-		    double(vsk.bitc[6]) / double(vertex_count) * 100, double(vsk.bitc[7]) / double(vertex_count) * 100);
 		printf("\n");
 	}
 #endif
@@ -1230,6 +1775,11 @@ size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, con
 	return data - buffer;
 }
 
+size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+	return meshopt_encodeVertexBufferLevel(buffer, buffer_size, vertices, vertex_count, vertex_size, meshopt::kEncodeDefaultLevel);
+}
+
 size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
 {
 	using namespace meshopt;
@@ -1240,21 +1790,42 @@ size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
 	size_t vertex_block_size = getVertexBlockSize(vertex_size);
 	size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size;
 
+	size_t vertex_block_control_size = vertex_size / 4;
 	size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4;
 	size_t vertex_block_data_size = vertex_block_size;
 
-	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
+	size_t tail_size = vertex_size + (vertex_size / 4);
+	size_t tail_size_min = kTailMinSizeV0 > kTailMinSizeV1 ? kTailMinSizeV0 : kTailMinSizeV1;
+	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
+	assert(tail_size_pad >= kByteGroupDecodeLimit);
 
-	return 1 + vertex_block_count * vertex_size * (vertex_block_header_size + vertex_block_data_size) + tail_size;
+	return 1 + vertex_block_count * vertex_size * (vertex_block_control_size + vertex_block_header_size + vertex_block_data_size) + tail_size_pad;
 }
 
 void meshopt_encodeVertexVersion(int version)
 {
-	assert(unsigned(version) <= 0);
+	assert(unsigned(version) <= unsigned(meshopt::kDecodeVertexVersion));
 
 	meshopt::gEncodeVertexVersion = version;
 }
 
+int meshopt_decodeVertexVersion(const unsigned char* buffer, size_t buffer_size)
+{
+	if (buffer_size < 1)
+		return -1;
+
+	unsigned char header = buffer[0];
+
+	if ((header & 0xf0) != meshopt::kVertexHeader)
+		return -1;
+
+	int version = header & 0x0f;
+	if (version > meshopt::kDecodeVertexVersion)
+		return -1;
+
+	return version;
+}
+
 int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
 {
 	using namespace meshopt;
@@ -1262,7 +1833,7 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 	assert(vertex_size > 0 && vertex_size <= 256);
 	assert(vertex_size % 4 == 0);
 
-	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = NULL;
+	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256], const unsigned char*, int) = NULL;
 
 #if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
 	decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
@@ -1282,7 +1853,7 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 	const unsigned char* data = buffer;
 	const unsigned char* data_end = buffer + buffer_size;
 
-	if (size_t(data_end - data) < 1 + vertex_size)
+	if (size_t(data_end - data) < 1)
 		return -2;
 
 	unsigned char data_header = *data++;
@@ -1291,11 +1862,22 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 		return -1;
 
 	int version = data_header & 0x0f;
-	if (version > 0)
+	if (version > kDecodeVertexVersion)
 		return -1;
 
+	size_t tail_size = vertex_size + (version == 0 ? 0 : vertex_size / 4);
+	size_t tail_size_min = version == 0 ? kTailMinSizeV0 : kTailMinSizeV1;
+	size_t tail_size_pad = tail_size < tail_size_min ? tail_size_min : tail_size;
+
+	if (size_t(data_end - data) < tail_size_pad)
+		return -2;
+
+	const unsigned char* tail = data_end - tail_size;
+
 	unsigned char last_vertex[256];
-	memcpy(last_vertex, data_end - vertex_size, vertex_size);
+	memcpy(last_vertex, tail, vertex_size);
+
+	const unsigned char* channels = version == 0 ? NULL : tail + vertex_size;
 
 	size_t vertex_block_size = getVertexBlockSize(vertex_size);
 
@@ -1305,16 +1887,14 @@ int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t ve
 	{
 		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
 
-		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
+		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex, channels, version);
 		if (!data)
 			return -2;
 
 		vertex_offset += block_size;
 	}
 
-	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
-
-	if (size_t(data_end - data) != tail_size)
+	if (size_t(data_end - data) != tail_size_pad)
 		return -3;
 
 	return 0;
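---

Note (not part of the patch): the new public entry points above can be exercised end to end roughly as follows. This is a minimal sketch; the `Vertex` layout, the level value 2, and the assumption that the new bitstream is version 1 are illustrative, not taken from the diff.

	// round-trip sketch against the meshoptimizer 0.23 API shown above
	#include "meshoptimizer.h"

	#include <cassert>
	#include <vector>

	struct Vertex
	{
		float px, py, pz;      // 32-bit position channel
		unsigned short nx, ny; // hypothetical 16-bit channels (normal, UV)
		unsigned short tu, tv; // sizeof(Vertex) == 20, a multiple of 4 as required
	};

	std::vector<unsigned char> compress(const std::vector<Vertex>& vertices)
	{
		// opt in to the v1 bitstream (assumption: the new version is 1);
		// version 0 keeps the previous format for older decoders
		meshopt_encodeVertexVersion(1);

		std::vector<unsigned char> buffer(meshopt_encodeVertexBufferBound(vertices.size(), sizeof(Vertex)));

		// per the encoder above, level > 1 enables channel estimation and level >= 3 adds rotation estimation
		size_t size = meshopt_encodeVertexBufferLevel(buffer.data(), buffer.size(), vertices.data(), vertices.size(), sizeof(Vertex), /* level= */ 2);
		assert(size > 0);

		buffer.resize(size);
		return buffer;
	}

	std::vector<Vertex> decompress(const std::vector<unsigned char>& buffer, size_t vertex_count)
	{
		// lets callers reject streams that are malformed or newer than this decoder supports
		int version = meshopt_decodeVertexVersion(buffer.data(), buffer.size());
		assert(version >= 0);
		(void)version;

		std::vector<Vertex> result(vertex_count);
		int rc = meshopt_decodeVertexBuffer(result.data(), vertex_count, sizeof(Vertex), buffer.data(), buffer.size());
		assert(rc == 0);
		(void)rc;

		return result;
	}

As the diff shows, `meshopt_encodeVertexBuffer` keeps its old signature and now forwards to the level variant with `meshopt::kEncodeDefaultLevel`, so existing callers are unaffected; only code that wants the v1 stream needs to call `meshopt_encodeVertexVersion`.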