@steve,
I also tried importing the .obj file as 49 separate meshes but the performance is the same; slow to display.
I saved the 1 mesh of 100M faces case to a .3dm file. A link to it is attached below if you want to use it as a test case for your Visual Studio analysis.
If I bring up Rhino and open this file it does not take too long to load. But then if I turn on the layer for the mesh, it takes 80 sec for it to appear using my Nvidia RTX 3080 ti GPU.
Any improvements you can make will be greatly appreciated by many. Many of us now have multicore processors, which I detect using this code in my C++ DLL:
//
// Procedures to find number of cores, logical cores and more.
//
#include <malloc.h>
typedef BOOL(WINAPI* LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
// Helper function to count set bits in the processor mask.
// Count the number of bits set in a processor affinity mask.
// Uses Kernighan's trick: clearing the lowest set bit each iteration,
// so the loop runs once per set bit instead of once per bit position.
DWORD CountSetBits(ULONG_PTR bitMask) {
    DWORD setBits = 0;
    while (bitMask != 0) {
        bitMask &= bitMask - 1;  // clear the lowest set bit
        ++setBits;
    }
    return setBits;
}
// Return the number of logical processors (hardware threads) on this machine,
// queried via GetLogicalProcessorInformation. Falls back to 8 if the API is
// unavailable, a call fails, or allocation fails.
// Returns: logical processor count (>0), or 8 on any failure.
DLLEXPORT int _cdecl get_core_count() {
    LPFN_GLPI glpi;
    BOOL done = FALSE;
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL;
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = NULL;
    DWORD returnLength = 0;
    DWORD logicalProcessorCount = 0;
    DWORD numaNodeCount = 0;
    DWORD processorCoreCount = 0;
    DWORD processorL1CacheCount = 0;
    DWORD processorL2CacheCount = 0;
    DWORD processorL3CacheCount = 0;
    DWORD processorPackageCount = 0;
    DWORD byteOffset = 0;
    PCACHE_DESCRIPTOR Cache;

    // Resolve the API dynamically so the DLL still loads on Windows versions
    // that predate GetLogicalProcessorInformation.
    glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
    if (NULL == glpi) { RhinoApp().Print("\nGetLogicalProcessorInformation is not supported. Core count set to 8.\n"); return (8); }

    // Standard two-call pattern: first call fails with ERROR_INSUFFICIENT_BUFFER
    // and reports the required size; allocate and retry until it succeeds.
    while (!done) {
        BOOL rc = glpi(buffer, &returnLength);  // API returns BOOL, not DWORD
        if (FALSE == rc) {
            // Capture the error code immediately — free()/Print() below may clobber it.
            DWORD lastError = GetLastError();
            if (lastError == ERROR_INSUFFICIENT_BUFFER) {
                if (buffer) free(buffer);
                buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(returnLength);
                if (NULL == buffer) { RhinoApp().Print("\nError: Allocation failure. Core count set to 8.\n"); return (8); }
            }
            else {
                // BUG FIX: previously leaked 'buffer' on this path when a retry
                // failed with an error other than ERROR_INSUFFICIENT_BUFFER.
                if (buffer) free(buffer);
                RhinoApp().Print("\nERROR %d. Core count set to 8.\n", lastError);
                return (8);
            }
        }
        else { done = TRUE; }
    }

    // Walk the array of SYSTEM_LOGICAL_PROCESSOR_INFORMATION records and tally
    // each relationship type.
    ptr = buffer;
    while (byteOffset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= returnLength) {
        switch (ptr->Relationship) {
        case RelationNumaNode:
            // Non-NUMA systems report a single record of this type.
            numaNodeCount++;
            break;
        case RelationProcessorCore:
            processorCoreCount++;
            // A hyperthreaded core supplies more than one logical processor,
            // one bit per logical processor in the affinity mask.
            logicalProcessorCount += CountSetBits(ptr->ProcessorMask);
            break;
        case RelationCache:
            // Cache data is in ptr->Cache, one CACHE_DESCRIPTOR structure for each cache.
            Cache = &ptr->Cache;
            if (Cache->Level == 1) { processorL1CacheCount++; }
            else if (Cache->Level == 2) { processorL2CacheCount++; }
            else if (Cache->Level == 3) { processorL3CacheCount++; }
            break;
        case RelationProcessorPackage:
            // Logical processors share a physical package.
            processorPackageCount++;
            break;
        default:
            RhinoApp().Print("\nError: Unsupported LOGICAL_PROCESSOR_RELATIONSHIP value.\n");
            break;
        }
        byteOffset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
        ptr++;
    }
    /*
    // Optional diagnostics (left disabled):
    RhinoApp().Print("\nGetLogicalProcessorInformation results:\n");
    RhinoApp().Print("Number of NUMA nodes: %d\n", numaNodeCount);
    RhinoApp().Print("Number of physical processor packages: %d\n", processorPackageCount);
    RhinoApp().Print("Number of processor cores: %d\n", processorCoreCount);
    RhinoApp().Print("Number of logical processors: %d\n", logicalProcessorCount);
    RhinoApp().Print("Number of processor L1/L2/L3 caches: %d/%d/%d\n", processorL1CacheCount, processorL2CacheCount, processorL3CacheCount);
    */
    free(buffer);
    // Logical (hyperthreaded) count measured ~10% faster than physical core
    // count for importing multiple serially-read meshes.
    return logicalProcessorCount;
    // Not as fast.
    //return processorCoreCount;
}
Here is an example where I called it from my C++ code to setup the number of parallel reads of the .obj file when sampling it to find the start/stop locations of vertices, textures, normals and faces in the file.
// Define lists for storing location of vertices, textures, normals and faces in .obj file.
// One [min, max] byte-offset pair per worker thread, sized to the detected
// logical-processor count so each thread parses its own slice of the file.
// NOTE(review): assumes `make_unique`/`uint64_t` are in scope (e.g. via
// `using namespace std;` and <cstdint>) — confirm against the enclosing file.
int32_t npar = get_core_count();
auto vmin = make_unique<uint64_t[]>(npar); auto vmax = make_unique<uint64_t[]>(npar); auto vtmin = make_unique<uint64_t[]>(npar); auto vtmax = make_unique<uint64_t[]>(npar);
auto vnmin = make_unique<uint64_t[]>(npar); auto vnmax = make_unique<uint64_t[]>(npar); auto fmin = make_unique<uint64_t[]>(npar); auto fmax = make_unique<uint64_t[]>(npar);
My code then uses binary-chop searches to find the exact start/stop locations of the mesh elements and then reads and parses them in parallel (24-way in my CPU’s case). This way I am able to read the 100MB of data for each of the 2M faces mesh in 30 ms or so. My wife always tells me to work fuzzy to focused if I want to get things done so I tried to follow her paradigm in developing this code. At first it only has an approximate idea of where the data is in the .obj file but it only takes 100 ms to gain this knowledge. Then it finds the exact locations, which is very fast, a few ms, because it only has to search in very small regions of the big 4.7 GB file. Finally it reads the data and parses it in parallel since it knows the exact locations. The alternative is to just start reading the file, parsing every line and storing the data, but this is 100X slower. This is the best trick I found for quickly reading the .obj file. Perhaps there are similar tricks that can be discovered for accelerating the display of my big meshes.
Regards,
Terry.