Various fixes, optimizations, and updates.
- Drastically reduce metadata duplication for subset builds (see the sketch after this list)
- Properly handle the OriginId dimension, as well as its stats, for subset builds (#277)
- Properly compute detailed stats for subset builds rather than disabling them
- Allow EPT to be used as input rather than treating it as an "info" result (#291)
- Handle large numbers of duplicate points that overflow the tree (#250)
- Update the arbiter bundle
- Update docs, removing outdated info
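
As a rough illustration of the metadata-duplication bullet (see `Builder::saveSources` in the builder.cpp diff below), a non-primary subset now writes a slimmed manifest entry per source file rather than the full metadata blob. A minimal sketch of that shape, assuming nlohmann/json and using a hypothetical path and point count:

```cpp
#include <iostream>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main()
{
    json list = json::array();

    // One entry per source file: path, whether it was inserted, its point
    // count, and a schema holding this subset's dimension stats.
    list.push_back({
        { "path", "s3://example-bucket/tiles/a.laz" },   // hypothetical input
        { "inserted", true }
    });

    auto& entry = list.back();
    entry.update({ { "points", 1234567 } });
    entry.update({ { "schema", json::array() } });   // per-subset stats live here

    std::cout << list.dump(2) << std::endl;
}
```

Only subset 1 keeps the complete per-source metadata; the merge step then recombines the per-subset stats without waking redundant copies of everything else.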
connormanning committed Aug 24, 2023
1 parent 8bd179c commit 7497aa4
Showing 17 changed files with 811 additions and 576 deletions.
9 changes: 9 additions & 0 deletions app/build.cpp
@@ -183,6 +183,15 @@ void Build::addArgs()
"logging (default: 10).",
[this](json j) { m_json["laz_14"] = extract(j); });

m_ap.add(
"--noSchemaStats",
"Skip detailed dimension statistics (for subset builds only).",
[this](json j)
{
checkEmpty(j);
m_json["withSchemaStats"] = false;
});

addArbiter();
}

11 changes: 6 additions & 5 deletions app/info.cpp
@@ -57,7 +57,8 @@ void Info::addArgs()

void Info::run()
{
const arbiter::Arbiter a = config::getArbiter(m_json);
const std::unique_ptr<arbiter::Arbiter> a =
config::getArbiter(m_json.dump());
StringList inputs = config::getInput(m_json);
if (inputs.empty())
{
@@ -67,7 +68,7 @@
if (std::any_of(inputs.begin(), inputs.end(), isDirectory))
{
std::cout << "Resolving inputs..." << std::endl;
inputs = resolve(inputs, a);
inputs = resolve(inputs, *a);
std::cout << "\tResolved." << std::endl;
}

@@ -97,7 +98,7 @@ void Info::run()
pipeline,
deep,
tmp,
a,
*a,
threads);
const SourceInfo summary = manifest::reduce(sources);

@@ -122,15 +123,15 @@
{
std::cout << "Saving output..." << std::endl;
const bool pretty = sources.size() <= 1000;
const auto endpoint = a.getEndpoint(output);
const auto endpoint = a->getEndpoint(output);
saveMany(sources, endpoint, threads, pretty);
std::cout << "\tSaved." << std::endl;
}

if (summaryFilename.size())
{
std::cout << "Saving summary..." << std::endl;
a.put(summaryFilename, json(summary).dump(2));
a->put(summaryFilename, json(summary).dump(2));
}
}

59 changes: 0 additions & 59 deletions doc/source/configuration.md
@@ -7,7 +7,6 @@ Entwine provides 4 sub-commands for indexing point cloud data:
| [build](#build) | Generate an EPT dataset from point cloud data |
| [info](#info) | Gather information about point clouds before building |
| [merge](#merge) | Merge datasets built as subsets |
| [convert](#convert) | Convert an EPT dataset to a different format |

These commands are invoked via the command line as:

@@ -384,64 +383,6 @@ where `n` is the `of` value from the subset specification.



## Convert

The `convert` command provides utilities to transform Entwine Point Tile output
into other formats. Currently the only conversion provided is to the
[Cesium 3D Tiles](https://github.com/AnalyticalGraphicsInc/3d-tiles) format.
For proper positioning, data must be reprojected to `EPSG:4978` during the
`entwine build` step.

| Key | Description |
|-----|-------------|
| [input](#input-convert) | Directory containing a completed Entwine build |
| [output](#output-convert) | Output directory for the converted dataset |
| [tmp](#tmp) | Temporary directory |
| [threads](#threads) | Number of parallel threads |
| [colorType](#colorType) | Color selection for output tileset |
| [truncate](#truncate) | Truncate color values to one byte |
| [geometricErrorDivisor](#geometricerrordivisor) | Geometric error divisor |

### input (convert)

The `input` to a `convert` command is the path of a directory containing a
completed Entwine build.

### output (convert)

Output directory in which to write the converted dataset.

### colorType

An optional setting to select a coloring method for the output. If omitted,
RGB will be used if they exist in the input, or Intensity if it exists and RGB
does not exist. If neither exist and no `colorType` is provided, then the
output tileset will not contain color.

If set, valid values to color the RGB in the output are:

| Value | Description |
|-------|-------------|
| `none` | RGB is omitted |
| `rgb` | Color values from the input |
| `intensity` | Grayscale intensity values |
| `tile` | Each tile is randomly colored |

### truncate

Cesium accepts one-byte color values, but many formats allow two-byte storage
of intensities and RGB values. If the input data contains values to be colored
as RGB values that are greater than `255`, then they may be scaled down to one
byte with the `--truncate` flag.

### geometricErrorDivisor

The root geometric error is determined as `cubeWidth / geometricErrorDivisor`,
which defaults to `32.0`. Lower values will cause Cesium to render the data
more densely.
```json
{ "geometricErrorDivisor": 16.0 }
```



124 changes: 106 additions & 18 deletions entwine/builder/builder.cpp
@@ -12,6 +12,8 @@

#include <algorithm>
#include <cassert>
#include <iostream>
#include <limits>

#include <pdal/PipelineManager.hpp>

@@ -29,6 +31,18 @@

namespace entwine
{
namespace
{

std::string toFullPrecisionString(double d) {
std::ostringstream os;
os <<
std::fixed <<
std::setprecision(std::numeric_limits<double>::max_digits10) <<
d;
return os.str();
}
}

Builder::Builder(
Endpoints endpoints,
@@ -211,17 +225,23 @@ void Builder::insert(
? getBounds(metadata.bounds, *metadata.subset)
: optional<Bounds>();

uint64_t inserted(0);
uint64_t insertedSinceLastSleep(0);
uint64_t pointId(0);

// We have our metadata point count - but now we'll count the points that
// are actually inserted. If the file's header metadata was inaccurate, or
// an overabundance of duplicate points causes some to be discarded, then we
// won't count them.
info.points = 0;

auto layout = toLayout(metadata.absoluteSchema);
VectorPointTable table(layout);
table.setProcess([&]()
{
inserted += table.numPoints();
if (inserted > heuristics::sleepCount)
insertedSinceLastSleep += table.numPoints();
if (insertedSinceLastSleep > heuristics::sleepCount)
{
inserted = 0;
insertedSinceLastSleep = 0;
clipper.clip();
}

@@ -248,11 +268,11 @@
if (!boundsSubset || boundsSubset->contains(point))
{
key.init(point);
cache.insert(voxel, key, ck, clipper);
++counts.inserts;
if (cache.insert(voxel, key, ck, clipper)) ++counts.inserts;
}
}
}
info.points += counts.inserts;
counter += counts.inserts;
});

@@ -261,7 +281,14 @@
: info.pipeline;
pipeline.at(0)["filename"] = localPath;

// TODO: Allow this to be set via config.
if (contains(metadata.schema, "OriginId"))
{
pipeline.push_back({
{ "type", "filters.assign" },
{ "value", "OriginId = " + std::to_string(originId) }
});
}

const bool needsStats = !hasStats(info.schema);
if (needsStats)
{
@@ -270,6 +297,22 @@
{
statsFilter.update({ { "enumerate", "Classification" } });
}

// Only accumulate stats for points that actually get inserted.
const Bounds b = boundsSubset
? *boundsSubset
: metadata.boundsConforming;

const auto& min = b.min();
const auto& max = b.max();

const std::string where =
"X >= " + toFullPrecisionString(min.x) + " && " +
"X < " + toFullPrecisionString(max.x) + " && " +
"Y >= " + toFullPrecisionString(min.y) + " && " +
"Y < " + toFullPrecisionString(max.y);

statsFilter.update({ { "where", where } });
}

pdal::PipelineManager pm;
@@ -286,17 +329,25 @@

last.execute(table);

// TODO:
// - update point count information for this file's metadata.
if (pdal::Stage* stage = findStage(last, "filters.stats"))
{
const pdal::StatsFilter& statsFilter(
dynamic_cast<const pdal::StatsFilter&>(*stage));

// Our source file metadata might not have an origin id since we add
// that dimension. In that case, add it to the source file's schema so
// it ends up being included in the stats.
if (contains(metadata.schema, "OriginId") &&
!contains(info.schema, "OriginId"))
{
info.schema.emplace_back("OriginId", Type::Unsigned32);
}

for (Dimension& d : info.schema)
{
const DimId id = layout.findDim(d.name);
d.stats = DimensionStats(statsFilter.getStats(id));
d.stats->count = info.points;
}
}
}
@@ -345,11 +396,43 @@ void Builder::saveSources(const unsigned threads)
{
// If we are a subset, write the whole detailed metadata as one giant
// blob, since we know we're going to need to wake up the whole thing to
// do the merge.
ensurePut(
endpoints.sources,
manifestFilename,
json(manifest).dump(getIndent(pretty)));
// do the merge. In this case, aside from the schema which contains
// detailed dimension stats for the subset, each corresponding item is
// identical per subset: so in that case we will only write the path and
// schema.
if (metadata.subset->id != 1)
{
json list = json::array();
for (auto& item : manifest)
{
list.push_back({
{ "path", item.source.path },
{ "inserted", item.inserted }
});

const auto& info = item.source.info;
auto& j = list.back();

if (item.inserted)
{
j.update({ { "points", info.points } });

if (info.points) j.update({ { "schema", info.schema } });
}
}

ensurePut(
endpoints.sources,
manifestFilename,
list.dump(getIndent(pretty)));
}
else
{
ensurePut(
endpoints.sources,
manifestFilename,
json(manifest).dump(getIndent(pretty)));
}
}
else
{
@@ -562,10 +645,15 @@ void merge(
verbose);
builder::mergeOne(builder, current, cache);

std::lock_guard<std::mutex> lock(mutex);
builder.manifest = manifest::merge(
builder.manifest,
current.manifest);
// Our base builder contains the manifest of subset 1 so we
// don't need to merge that one.
if (id > 1)
{
std::lock_guard<std::mutex> lock(mutex);
builder.manifest = manifest::merge(
builder.manifest,
current.manifest);
}
});
}
else if (verbose) std::cout << "skipping" << std::endl;
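One detail worth calling out from the builder.cpp changes above: the stats filter's `where` clause is built with the new `toFullPrecisionString` helper rather than default stream formatting. A minimal standalone sketch (not part of this commit) of why that matters:

```cpp
#include <iomanip>
#include <iostream>
#include <limits>
#include <sstream>

int main()
{
    const double x = 636273.9912345678;    // hypothetical projected coordinate

    std::ostringstream lossy;
    lossy << x;                            // default precision: prints "636274"

    std::ostringstream full;
    full << std::fixed
         << std::setprecision(std::numeric_limits<double>::max_digits10)
         << x;                             // keeps every representable digit

    std::cout << "default:        " << lossy.str() << "\n"
              << "full precision: " << full.str() << "\n";
}
```

With only the default six significant digits, the bounds strings would round away the fractional part of typical projected coordinates, so points near a subset's edges could land on the wrong side of the `where` predicate.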
10 changes: 6 additions & 4 deletions entwine/builder/chunk-cache.cpp
@@ -61,13 +61,15 @@ void ChunkCache::join()
}));
}

void ChunkCache::insert(
bool ChunkCache::insert(
Voxel& voxel,
Key& key,
const ChunkKey& ck,
Clipper& clipper)
{
assert(ck.depth() < maxDepth);
// This point is likely one of several thousand points with exactly
// duplicated XYZ values - discard it.
if (ck.depth() >= maxDepth) return false;

// Get from single-threaded cache if we can.
Chunk* chunk = clipper.get(ck);
@@ -76,12 +78,12 @@
if (!chunk) chunk = &addRef(ck, clipper);

// Try to insert the point into this chunk.
if (chunk->insert(*this, clipper, voxel, key)) return;
if (chunk->insert(*this, clipper, voxel, key)) return true;

// Failed to insert - need to traverse to the next depth.
key.step(voxel.point());
const Dir dir(getDirection(ck.bounds().mid(), voxel.point()));
insert(voxel, key, chunk->childAt(dir), clipper);
return insert(voxel, key, chunk->childAt(dir), clipper);
}

Chunk& ChunkCache::addRef(const ChunkKey& ck, Clipper& clipper)
2 changes: 1 addition & 1 deletion entwine/builder/chunk-cache.hpp
@@ -71,7 +71,7 @@ class ChunkCache

~ChunkCache();

void insert(Voxel& voxel, Key& key, const ChunkKey& ck, Clipper& clipper);
bool insert(Voxel& voxel, Key& key, const ChunkKey& ck, Clipper& clipper);
void clip(uint64_t depth, const std::map<Xyz, Chunk*>& stale);
void clipped() { maybePurge(m_cacheSize); }
void join();
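The signature change from `void` to `bool` above is what lets duplicate-point overflow (#250) be handled gracefully: insertion walks deeper for every colliding point, and anything that reaches `maxDepth` is discarded rather than asserting. A self-contained sketch of that pattern, with toy types that are not Entwine's actual API:

```cpp
#include <cstdint>
#include <iostream>

constexpr uint64_t maxDepth = 64;

// A free voxel exists at every depth greater than occupiedThrough, so exact
// duplicates occupy depths 1, 2, 3, ... until the cap is hit.
bool insert(uint64_t depth, uint64_t& occupiedThrough)
{
    if (depth >= maxDepth) return false;        // Overflow: discard the point.
    if (depth > occupiedThrough)
    {
        occupiedThrough = depth;                // Found a free voxel here.
        return true;
    }
    return insert(depth + 1, occupiedThrough);  // Occupied: step one level down.
}

int main()
{
    uint64_t occupiedThrough = 0;
    uint64_t inserted = 0;

    for (int i = 0; i < 100; ++i)               // 100 points, identical XYZ.
    {
        if (insert(1, occupiedThrough)) ++inserted;  // Count real inserts only.
    }

    std::cout << inserted << " of 100 duplicates kept\n";   // Prints "63 of 100".
}
```

The caller-side counterpart is visible in builder.cpp above: `if (cache.insert(voxel, key, ck, clipper)) ++counts.inserts;`, so discarded duplicates never inflate the per-file point counts.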
(The remaining changed files are not shown.)
