Changed to use lfpAlloc from ltalloc for experimental multi-threaded .obj parser since ltalloc is not a porable(e.g. it does not support ARM archtecture).

This commit is contained in:
Syoyo Fujita
2017-07-05 16:32:40 +09:00
parent 99518b6d3e
commit 95fba2ab32
12 changed files with 402 additions and 1064 deletions

View File

@@ -1,5 +1,16 @@
Experimental code for .obj loader. # Experimental code for .obj loader.
* Multi-threaded optimized parser : tinyobj_loader_opt.h * Multi-threaded optimized parser : tinyobj_loader_opt.h
## Requirements
* C++-11 compiler
## Compile options
* zstd compressed .obj support. `--with-zstd` premake option. * zstd compressed .obj support. `--with-zstd` premake option.
* gzip compressed .obj support. `--with-zlib` premake option. * gzip compressed .obj support. `--with-zlib` premake option.
## Licenses
* lfpAlloc : MIT license.

View File

@@ -0,0 +1,89 @@
#ifndef LF_POOL_ALLOCATOR
#define LF_POOL_ALLOCATOR
#include <memory>
#include <thread>
#include <lfpAlloc/PoolDispatcher.hpp>
namespace lfpAlloc {
template <typename T, std::size_t NumPools = 70>
class lfpAllocator {
public:
using value_type = T;
using pointer = T*;
using const_pointer = const T*;
using reference = T&;
using const_reference = T const&;
template <typename U>
struct rebind {
typedef lfpAllocator<U, NumPools> other;
};
lfpAllocator() {}
template <typename U>
lfpAllocator(lfpAllocator<U, NumPools>&&) noexcept {}
template <typename U>
lfpAllocator(const lfpAllocator<U, NumPools>&) noexcept {}
T* allocate(std::size_t count) {
if (sizeof(T) * count <=
alignof(std::max_align_t) * NumPools - sizeof(void*)) {
return reinterpret_cast<T*>(
dispatcher_.allocate(sizeof(T) * count));
} else {
return new T[count];
}
}
void deallocate(T* p, std::size_t count) noexcept {
if (sizeof(T) * count <=
alignof(std::max_align_t) * NumPools - sizeof(void*)) {
dispatcher_.deallocate(p, sizeof(T) * count);
} else {
delete[] p;
}
}
// Should not be required, but allocator_traits is not complete in
// gcc 4.9.1
template <typename U>
void destroy(U* p) {
p->~U();
}
template <typename U, typename... Args>
void construct(U* p, Args&&... args) {
new (p) U(std::forward<Args>(args)...);
}
template <typename Ty, typename U, std::size_t N, std::size_t M>
friend bool operator==(const lfpAllocator<Ty, N>&,
const lfpAllocator<U, M>&) noexcept;
template <typename U, std::size_t M>
friend class lfpAllocator;
private:
static PoolDispatcher<NumPools> dispatcher_;
};
template <typename T, std::size_t N>
PoolDispatcher<N> lfpAllocator<T, N>::dispatcher_;
template <typename T, typename U, std::size_t N, std::size_t M>
inline bool operator==(const lfpAllocator<T, N>&,
const lfpAllocator<U, M>&) noexcept {
return N == M;
}
template <typename T, typename U, std::size_t N, std::size_t M>
inline bool operator!=(const lfpAllocator<T, N>& left,
const lfpAllocator<U, M>& right) noexcept {
return !(left == right);
}
}
#endif

View File

@@ -0,0 +1,116 @@
#ifndef LF_POOL_ALLOC_CHUNK_LIST
#define LF_POOL_ALLOC_CHUNK_LIST
#include <cstdint>
#include <atomic>
#include <type_traits>
#ifndef LFP_ALLOW_BLOCKING
static_assert(ATOMIC_POINTER_LOCK_FREE == 2,
"Atomic pointer is not lock-free.");
#endif
namespace lfpAlloc {
template <std::size_t Size>
struct Cell {
uint8_t val_[Size];
Cell* next_ = this + 1;
};
// For small types (less than the size of void*), no additional
// space is needed, so union val_ with next_ to avoid overhead.
template <>
struct Cell<0> {
Cell() : next_{this + 1} {}
union {
uint8_t val_[sizeof(Cell*)];
Cell* next_;
};
};
template <std::size_t Size, std::size_t AllocationsPerChunk>
struct Chunk {
Chunk() noexcept {
auto& last = memBlock_[AllocationsPerChunk - 1];
last.next_ = nullptr;
}
Cell<Size> memBlock_[AllocationsPerChunk];
};
template <typename T>
struct Node {
Node() : val_(), next_(nullptr) {}
Node(const T& val) : val_(val), next_(nullptr) {}
T val_;
std::atomic<Node<T>*> next_;
};
template <std::size_t Size, std::size_t AllocationsPerChunk>
class ChunkList {
static constexpr auto CellSize =
(Size > sizeof(void*)) ? Size - sizeof(void*) : 0;
using Chunk_t = Chunk<CellSize, AllocationsPerChunk>;
using Cell_t = Cell<CellSize>;
using ChunkNode = Node<Chunk_t>;
using CellNode = Node<Cell_t*>;
public:
static ChunkList& getInstance() {
static ChunkList c;
return c;
}
Cell_t* allocateChain() {
CellNode* recentHead = head_.load();
CellNode* currentNext = nullptr;
do {
// If there are no available chains, allocate a new chunk
if (!recentHead) {
ChunkNode* currentHandle;
// Make a new node
auto newChunk = new ChunkNode();
// Add the chunk to the chain
do {
currentHandle = handle_.load();
newChunk->next_ = currentHandle;
} while (
!handle_.compare_exchange_weak(currentHandle, newChunk));
return &newChunk->val_.memBlock_[0];
}
currentNext = recentHead->next_;
} while (!head_.compare_exchange_weak(recentHead, currentNext));
auto retnValue = recentHead->val_;
delete recentHead;
return retnValue;
}
void deallocateChain(Cell_t* newCell) {
if (!newCell) {
return;
}
CellNode* currentHead = head_.load();
// Construct a new node to be added to the linked list
CellNode* newHead = new CellNode(newCell);
// Add the chain to the linked list
do {
newHead->next_.store(currentHead, std::memory_order_release);
} while (!head_.compare_exchange_weak(currentHead, newHead));
}
private:
ChunkList() : handle_(nullptr), head_(nullptr) {}
std::atomic<ChunkNode*> handle_;
std::atomic<CellNode*> head_;
};
}
#endif

View File

@@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2014 Adam Schwalm
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

View File

@@ -0,0 +1,48 @@
#ifndef LF_POOL_ALLOC_POOL
#define LF_POOL_ALLOC_POOL
#include <lfpAlloc/Utils.hpp>
#include <lfpAlloc/ChunkList.hpp>
namespace lfpAlloc {
template <std::size_t Size, std::size_t AllocationsPerChunk>
class Pool {
using ChunkList_t = ChunkList<Size, AllocationsPerChunk>;
public:
static constexpr auto CellSize =
(Size > sizeof(void*)) ? Size - sizeof(void*) : 0;
using Cell_t = Cell<CellSize>;
Pool() : head_(nullptr) {}
~Pool() { ChunkList_t::getInstance().deallocateChain(head_); }
void* allocate() {
// Head loaded from head_
Cell_t* currentHead = head_;
Cell_t* next;
// Out of cells to allocate
if (!currentHead) {
currentHead = ChunkList_t::getInstance().allocateChain();
}
next = currentHead->next_;
head_ = next;
return &currentHead->val_;
}
void deallocate(void* p) noexcept {
auto newHead = reinterpret_cast<Cell_t*>(p);
Cell_t* currentHead = head_;
newHead->next_ = currentHead;
head_ = newHead;
}
private:
Cell_t* head_;
};
}
#endif

View File

@@ -0,0 +1,79 @@
#ifndef LF_POOL_DISPATCHER
#define LF_POOL_DISPATCHER
#include <tuple>
#include <cassert>
#include <cstddef>
#include <lfpAlloc/Pool.hpp>
#ifndef LFP_ALLOCATIONS_PER_CHUNK
#define LFP_ALLOCATIONS_PER_CHUNK 64 * 100
#endif
namespace lfpAlloc {
namespace detail {
template <std::size_t Num, uint16_t... Ts>
struct Pools : Pools<Num - 1, alignof(std::max_align_t) * Num, Ts...> {};
template <uint16_t... Size>
struct Pools<0, Size...> {
using type = std::tuple<Pool<Size, LFP_ALLOCATIONS_PER_CHUNK>...>;
};
}
template <std::size_t NumPools>
class PoolDispatcher {
public:
void* allocate(std::size_t size) { return dispatchAllocate<0>(size); }
void deallocate(void* p, std::size_t size) noexcept {
dispatchDeallocate<0>(p, size);
}
private:
thread_local static typename detail::Pools<NumPools>::type pools_;
static_assert(NumPools > 0, "Invalid number of pools");
template <std::size_t Index>
typename std::enable_if <
Index<NumPools, void*>::type
dispatchAllocate(std::size_t const& requestSize) {
if (requestSize <= std::get<Index>(pools_).CellSize) {
return std::get<Index>(pools_).allocate();
} else {
return dispatchAllocate<Index + 1>(requestSize);
}
}
template <std::size_t Index>
typename std::enable_if<!(Index < NumPools), void*>::type
dispatchAllocate(std::size_t const&) {
assert(false && "Invalid allocation size.");
return nullptr;
}
template <std::size_t Index>
typename std::enable_if <
Index<NumPools>::type
dispatchDeallocate(void* p, std::size_t const& requestSize) noexcept {
if (requestSize <= std::get<Index>(pools_).CellSize) {
std::get<Index>(pools_).deallocate(p);
} else {
dispatchDeallocate<Index + 1>(p, requestSize);
}
}
template <std::size_t Index>
typename std::enable_if<!(Index < NumPools)>::type
dispatchDeallocate(void*, std::size_t const&) noexcept {
assert(false && "Invalid deallocation size.");
}
};
template <std::size_t NumPools>
thread_local typename detail::Pools<NumPools>::type
PoolDispatcher<NumPools>::pools_;
}
#endif

View File

@@ -0,0 +1,20 @@
#include <cstdint>
namespace lfpAlloc {
namespace detail {
template <std::size_t Val, std::size_t base = 2>
struct Log {
enum { value = 1 + Log<Val / base, base>::value };
};
template <std::size_t base>
struct Log<1, base> {
enum { value = 0 };
};
template <std::size_t base>
struct Log<0, base> {
enum { value = 0 };
};
}
}

View File

@@ -1,973 +0,0 @@
/*
Copyright (c) 2013, Alexander Tretyak
Copyright (c) 2015, r-lyeh
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of the author nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Note: The latest version of this memory allocator obtainable at
http://ltalloc.googlecode.com/hg/ltalloc.cc
Project URL: http://code.google.com/p/ltalloc
*/
#include "ltalloc.h"
#define LTALLOC_VERSION "2.0.0" /* (2015/06/16) - ltcalloc(), ltmsize(), ltrealloc(), ltmemalign(), LTALLOC_AUTO_GC_INTERVAL
#define LTALLOC_VERSION "1.0.0" /* (2015/06/16) - standard STL allocator provided [see ltalloc.hpp file](ltalloc.hpp)
#define LTALLOC_VERSION "0.0.0" /* (2013/xx/xx) - fork from public repository */
//Customizable constants
//#define LTALLOC_DISABLE_OPERATOR_NEW_OVERRIDE
//#define LTALLOC_AUTO_GC_INTERVAL 3.0
#ifndef LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO
#define LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO 2//determines how accurately size classes are spaced (i.e. when = 0, allocation requests are rounded up to the nearest power of two (2^n), when = 1, rounded to 2^n, or (2^n)*1.5, when = 2, rounded to 2^n, (2^n)*1.25, (2^n)*1.5, or (2^n)*1.75, and so on); this parameter have direct influence on memory fragmentation - bigger values lead to reducing internal fragmentation (which can be approximately estimated as pow(0.5, VALUE)*100%), but at the same time increasing external fragmentation
#endif
#define CHUNK_SIZE (64*1024)//size of chunk (basic allocation unit for all allocations of size <= MAX_BLOCK_SIZE); must be a power of two (as well as all following parameters), also should not be less than allocation granularity on Windows (which is always 64K by design of NT kernel)
#define CACHE_LINE_SIZE 64
#define MAX_NUM_OF_BLOCKS_IN_BATCH 256//maximum number of blocks to move between a thread cache and a central cache in one shot
static const unsigned int MAX_BATCH_SIZE = 64*1024;//maximum total size of blocks to move between a thread cache and a central cache in one shot (corresponds to half size of thread cache of each size class)
static const unsigned int MAX_BLOCK_SIZE = CHUNK_SIZE;//requesting memory of any size greater than this value will lead to direct call of system virtual memory allocation routine
//Platform-specific stuff
#ifdef __cplusplus
#define CPPCODE(code) code
#include <new>
#else
#define CPPCODE(code)
#endif
#ifdef LTALLOC_AUTO_GC_INTERVAL
#include <time.h>
# if LTALLOC_AUTO_GC_INTERVAL <= 0
# undef LTALLOC_AUTO_GC_INTERVAL
# define LTALLOC_AUTO_GC_INTERVAL 3.00
# endif
#endif
#ifdef __GNUC__
#define __STDC_LIMIT_MACROS
#include <stdint.h> //for SIZE_MAX
#include <limits.h> //for UINT_MAX
#define alignas(a) __attribute__((aligned(a)))
#define thread_local __thread
#define NOINLINE __attribute__((noinline))
#define CAS_LOCK(lock) __sync_lock_test_and_set(lock, 1)
#define SPINLOCK_RELEASE(lock) __sync_lock_release(lock)
#define PAUSE __asm__ __volatile__("pause" ::: "memory")
#define BSR(r, v) r = CODE3264(__builtin_clz(v) ^ 31, __builtin_clzll(v) ^ 63)//x ^ 31 = 31 - x, but gcc does not optimize 31 - __builtin_clz(x) to bsr(x), but generates 31 - (bsr(x) ^ 31)
#elif _MSC_VER
#define _ALLOW_KEYWORD_MACROS
#include <limits.h> //for SIZE_MAX and UINT_MAX
#define alignas(a) __declspec(align(a))
#define thread_local __declspec(thread)
#define NOINLINE __declspec(noinline)
#define CAS_LOCK(lock) _InterlockedExchange((long*)lock, 1)
#define SPINLOCK_RELEASE(lock) _InterlockedExchange((long*)lock, 0)
#define PAUSE _mm_pause()
#define BSR(r, v) CODE3264(_BitScanReverse, _BitScanReverse64)((unsigned long*)&r, v)
CPPCODE(extern "C") long _InterlockedExchange(long volatile *, long);
CPPCODE(extern "C") void _mm_pause();
#pragma warning(disable: 4127 4201 4324 4290)//"conditional expression is constant", "nonstandard extension used : nameless struct/union", and "structure was padded due to __declspec(align())"
#else
#error Unsupported compiler
#endif
#if __GNUC__ || __INTEL_COMPILER
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
#else
#define likely(x) (x)
#define unlikely(x) (x)
#endif
static void SPINLOCK_ACQUIRE(volatile int *lock) {if (CAS_LOCK(lock)) while (*lock || CAS_LOCK(lock)) PAUSE;}
#include <assert.h>
#include <string.h> //for memset
#if SIZE_MAX == UINT_MAX
#define CODE3264(c32, c64) c32
#else
#define CODE3264(c32, c64) c64
#endif
typedef char CODE3264_check[sizeof(void*) == CODE3264(4, 8) ? 1 : -1];
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#define VMALLOC(size) VirtualAlloc(NULL, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE)
#define VMFREE(p, size) VirtualFree(p, 0, MEM_RELEASE)
#else
#include <sys/mman.h>
#include <unistd.h>
#define VMALLOC(size) (void*)(((uintptr_t)mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0)+1)&~1)//with the conversion of MAP_FAILED to 0
#define VMFREE(p, size) munmap(p, size)
static size_t page_size()
{
assert((uintptr_t)MAP_FAILED+1 == 0);//have to use dynamic check somewhere, because some gcc versions (e.g. 4.4.5) won't compile typedef char MAP_FAILED_value_static_check[(uintptr_t)MAP_FAILED+1 == 0 ? 1 : -1];
static size_t pagesize = 0;
if (!pagesize) pagesize = sysconf(_SC_PAGE_SIZE);//assuming that writing of size_t value is atomic, so this can be done safely in different simultaneously running threads
return pagesize;
}
typedef struct PTrieNode // Compressed radix tree (patricia trie) for storing sizes of system allocations
{ // This implementation have some specific properties:
uintptr_t keys[2]; // 1. There are no separate leaf nodes (with both null children), as leaf's value is stored directly in place of corresponding child node pointer. Least significant bit of that pointer used to determine its meaning (i.e., is it a value, or child node pointer).
struct PTrieNode *childNodes[2];// 2. Inserting a new element (key/value) into this tree require to create always an exactly one new node (and similarly for remove key/node operation).
} PTrieNode; // 3. Tree always contains just one node with null child (i.e. all but one nodes in any possible tree are always have two children).
#define PTRIE_NULL_NODE (PTrieNode*)(uintptr_t)1
static PTrieNode *ptrieRoot = PTRIE_NULL_NODE, *ptrieFreeNodesList = NULL, *ptrieNewAllocatedPage = NULL;
static volatile int ptrieLock = 0;
static uintptr_t ptrie_lookup(uintptr_t key)
{
PTrieNode *node = ptrieRoot;
uintptr_t *lastKey = NULL;
while (!((uintptr_t)node & 1))
{
int branch = (key >> (node->keys[0] & 0xFF)) & 1;
lastKey = &node->keys[branch];
node = node->childNodes[branch];
}
assert(lastKey && (*lastKey & ~0xFF) == key);
return (uintptr_t)node & ~1;
}
static void ptrie_insert(uintptr_t key, uintptr_t value, PTrieNode *newNode/* = (PTrieNode*)malloc(sizeof(PTrieNode))*/)
{
PTrieNode **node = &ptrieRoot, *n;
uintptr_t *prevKey = NULL, x, pkey;
unsigned int index, b;
assert(!((value & 1) | (key & 0xFF)));//check constraints for key/value
for (;;)
{
n = *node;
if (!((uintptr_t)n & 1))//not a leaf
{
int prefixEnd = n->keys[0] & 0xFF;
x = key ^ n->keys[0];// & ~0xFF;
if (!(x & (~(uintptr_t)1 << prefixEnd))) {//prefix matches, so go on
int branch = (key >> prefixEnd) & 1;
node = &n->childNodes[branch];
prevKey = &n->keys[branch];
} else {//insert a new node before current
pkey = n->keys[0] & ~0xFF;
break;
}
} else {//leaf
if (*node == PTRIE_NULL_NODE) {
*node = newNode;
newNode->keys[0] = key;//left prefixEnd = 0, so all following insertions will be before this node
newNode->childNodes[0] = (PTrieNode*)(value | 1);
newNode->childNodes[1] = PTRIE_NULL_NODE;
return;
} else {
pkey = *prevKey & ~0xFF;
x = key ^ pkey;
assert(x/*key != pkey*/ && "key already inserted");
break;
}
}
}
BSR(index, x);
b = (key >> index) & 1;
newNode->keys[b] = key;
newNode->keys[b^1] = pkey;
newNode->keys[0] |= index;
newNode->childNodes[b] = (PTrieNode*)(value | 1);
newNode->childNodes[b^1] = n;
*node = newNode;
}
static uintptr_t ptrie_remove(uintptr_t key)
{
PTrieNode **node = &ptrieRoot;
uintptr_t *pkey = NULL;
assert(ptrieRoot != PTRIE_NULL_NODE && "trie is empty!");
for (;;)
{
PTrieNode *n = *node;
int branch = (key >> (n->keys[0] & 0xFF)) & 1;
PTrieNode *cn = n->childNodes[branch];//current child node
if ((uintptr_t)cn & 1)//leaf
{
PTrieNode *other = n->childNodes[branch^1];
assert((n->keys[branch] & ~0xFF) == key);
assert(cn != PTRIE_NULL_NODE && "node's key is probably broken");
// if (other == PTRIE_NULL_NODE) *node = PTRIE_NULL_NODE; else//special handling for null child nodes is not necessary
if (((uintptr_t)other & 1) && other != PTRIE_NULL_NODE)//if other node is not a pointer
*pkey = (n->keys[branch^1] & ~0xFF) | ((*pkey) & 0xFF);
*node = other;
*(PTrieNode**)n = ptrieFreeNodesList; ptrieFreeNodesList = n;//free(n);
return (uintptr_t)cn & ~1;
}
pkey = &n->keys[branch];
node = &n->childNodes[branch];
}
}
#endif
static void *sys_aligned_alloc(size_t alignment, size_t size)
{
void *p = VMALLOC(size);//optimistically try mapping precisely the right amount before falling back to the slow method
assert(!(alignment & (alignment-1)) && "alignment must be a power of two");
if ((uintptr_t)p & (alignment-1)/* && p != MAP_FAILED*/)
{
VMFREE(p, size);
#ifdef _WIN32
{static DWORD allocationGranularity = 0;
if (!allocationGranularity) {
SYSTEM_INFO si;
GetSystemInfo(&si);
allocationGranularity = si.dwAllocationGranularity;
}
if ((uintptr_t)p < 16*1024*1024)//fill "bubbles" (reserve unaligned regions) at the beginning of virtual address space, otherwise there will be always falling back to the slow method
VirtualAlloc(p, alignment - ((uintptr_t)p & (alignment-1)), MEM_RESERVE, PAGE_NOACCESS);
do
{
p = VirtualAlloc(NULL, size + alignment - allocationGranularity, MEM_RESERVE, PAGE_NOACCESS);
if (p == NULL) return NULL;
VirtualFree(p, 0, MEM_RELEASE);//unfortunately, WinAPI doesn't support release a part of allocated region, so release a whole region
p = VirtualAlloc((void*)(((uintptr_t)p + (alignment-1)) & ~(alignment-1)), size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
} while (p == NULL);}
#else
p = VMALLOC(size + alignment - page_size());
if (p/* != MAP_FAILED*/)
{
uintptr_t ap = ((uintptr_t)p + (alignment-1)) & ~(alignment-1);
uintptr_t diff = ap - (uintptr_t)p;
if (diff) VMFREE(p, diff);
diff = alignment - page_size() - diff;
assert((intptr_t)diff >= 0);
if (diff) VMFREE((void*)(ap + size), diff);
return (void*)ap;
}
#endif
}
//if (p == 0) p = sys_aligned_alloc(alignment, size);//just in case (because 0 pointer is handled specially elsewhere)
//if (p == MAP_FAILED) p = NULL;
return p;
}
static NOINLINE void sys_free(void *p)
{
if (p == NULL) return;
#ifdef _WIN32
VirtualFree(p, 0, MEM_RELEASE);
#else
SPINLOCK_ACQUIRE(&ptrieLock);
size_t size = ptrie_remove((uintptr_t)p);
SPINLOCK_RELEASE(&ptrieLock);
munmap(p, size);
#endif
}
static void release_thread_cache(void*);
#ifdef __GNUC__
#include <pthread.h>
#pragma weak pthread_once
#pragma weak pthread_key_create
#pragma weak pthread_setspecific
static pthread_key_t pthread_key;
static pthread_once_t init_once = PTHREAD_ONCE_INIT;
static void init_pthread_key() { pthread_key_create(&pthread_key, release_thread_cache); }
static thread_local int thread_initialized = 0;
static void init_pthread_destructor()//must be called only when some block placed into a thread cache's free list
{
if (unlikely(!thread_initialized))
{
thread_initialized = 1;
if (pthread_once)
{
pthread_once(&init_once, init_pthread_key);
pthread_setspecific(pthread_key, (void*)1);//set nonzero value to force calling of release_thread_cache() on thread terminate
}
}
}
#else
static void NTAPI on_tls_callback(PVOID h, DWORD reason, PVOID pv) { h; pv; if (reason == DLL_THREAD_DETACH) release_thread_cache(0); }
#pragma comment(linker, "/INCLUDE:" CODE3264("_","") "p_thread_callback_ltalloc")
#pragma const_seg(".CRT$XLL")
extern CPPCODE("C") const PIMAGE_TLS_CALLBACK p_thread_callback_ltalloc = on_tls_callback;
#pragma const_seg()
#define init_pthread_destructor()
#endif
//End of platform-specific stuff
#define MAX_BLOCK_SIZE (MAX_BLOCK_SIZE < CHUNK_SIZE - (CHUNK_SIZE >> (1 + LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO)) ? \
MAX_BLOCK_SIZE : CHUNK_SIZE - (CHUNK_SIZE >> (1 + LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO)))
#define NUMBER_OF_SIZE_CLASSES ((sizeof(void*)*8 + 1) << LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO)
typedef struct FreeBlock
{
struct FreeBlock *next,
*nextBatch;//in the central cache blocks are organized into batches to allow fast moving blocks from thread cache and back
} FreeBlock;
typedef struct alignas(CACHE_LINE_SIZE) ChunkBase//force sizeof(Chunk) = cache line size to avoid false sharing
{
unsigned int sizeClass;
} Chunk;
typedef struct alignas(CACHE_LINE_SIZE) ChunkSm//chunk of smallest blocks of size = sizeof(void*)
{
unsigned int sizeClass;//struct ChunkBase chunk;
struct ChunkSm *prev, *next;
int numBatches;
#define NUM_OF_BATCHES_IN_CHUNK_SM CHUNK_SIZE/(sizeof(void*)*MAX_NUM_OF_BLOCKS_IN_BATCH)
FreeBlock *batches[NUM_OF_BATCHES_IN_CHUNK_SM];//batches of blocks inside ChunkSm have to be stored separately (as smallest blocks of size = sizeof(void*) do not have enough space to store second pointer for the batch)
} ChunkSm;
typedef struct alignas(CACHE_LINE_SIZE)//align needed to prevent cache line sharing between adjacent classes accessed from different threads
{
volatile int lock;
unsigned int freeBlocksInLastChunk;
char *lastChunk;//Chunk or ChunkSm
union {
FreeBlock *firstBatch;
ChunkSm *chunkWithFreeBatches;
};
FreeBlock *freeList;//short list of free blocks that for some reason are not organized into batches
unsigned int freeListSize;//should be less than batch size
uintptr_t minChunkAddr, maxChunkAddr;
} CentralCache;
static CentralCache centralCache[NUMBER_OF_SIZE_CLASSES];// = {{0}};
typedef struct
{
FreeBlock *freeList;
FreeBlock *tempList;//intermediate list providing a hysteresis in order to avoid a corner case of too frequent moving free blocks to the central cache and back from
int counter;//number of blocks in freeList (used to determine when to move free blocks list to the central cache)
} ThreadCache;
static thread_local ThreadCache threadCache[NUMBER_OF_SIZE_CLASSES];// = {{0}};
static struct
{
volatile int lock;
void *freeChunk;
size_t size;
} pad = {0, NULL, 0};
static CPPCODE(inline) unsigned int get_size_class(size_t size)
{
unsigned int index;
#if _MSC_VER && LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO == 2
static const unsigned char small_size_classes[256] = {//have to use a const array here, because MS compiler unfortunately does not evaluate _BitScanReverse with a constant argument at compile time (as gcc does for __builtin_clz)
#if CODE3264(1, 0)
131, 4, 15, 17, 19, 20, 21, 22, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43
#else
131, 15, 19, 21, 23, 24, 25, 26, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47
#endif
};
if (size < 256 * sizeof(void*) - (sizeof(void*)-1))
return small_size_classes[(size + (sizeof(void*)-1)) / sizeof(void*)];
#endif
size = (size + (sizeof(void*)-1)) & ~(sizeof(void*)-1);//minimum block size is sizeof(void*), doing this is better than just "size = max(size, sizeof(void*))"
BSR(index, (size-1)|1);//"|1" needed because result of BSR is undefined for zero input
#if LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO == 0
return index;
#else
return (index<<LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO) + (unsigned int)((size-1) >> (index-LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO));
#endif
}
static unsigned int class_to_size(unsigned int c)
{
#if LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO == 0
return 2 << c;
#else
#if LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO >= CODE3264(2, 3)
if (unlikely(c < (LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO<<LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO)))//for block sizes less than or equal to pow(2, LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO)
return 2 << (c>>LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO);
else
#endif
{
c -= (1<<LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO)-1;
return ((c & ((1<<LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO)-1)) | (1<<LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO)) << ((c>>LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO)-LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO);
}
#endif
}
static unsigned int batch_size(unsigned int sizeClass)//calculates a number of blocks to move between a thread cache and a central cache in one shot
{
return ((MAX_BATCH_SIZE-1) >> (sizeClass >> LTALLOC_SIZE_CLASSES_SUBPOWER_OF_TWO)) & (MAX_NUM_OF_BLOCKS_IN_BATCH-1);
}
CPPCODE(template <bool> static) void *ltmalloc(size_t size);
CPPCODE(template <bool throw_>) static void *fetch_from_central_cache(size_t size, ThreadCache *tc, unsigned int sizeClass)
{
void *p;
if (likely(size-1u <= MAX_BLOCK_SIZE-1u))//<=> if (size <= MAX_BLOCK_SIZE && size != 0)
{
FreeBlock *fb = tc->tempList;
if (fb)
{
assert(tc->counter == (int)batch_size(sizeClass)+1);
tc->counter = 1;
tc->freeList = fb->next;
tc->tempList = NULL;
return fb;
}
assert(tc->counter == 0 || tc->counter == (int)batch_size(sizeClass)+1);
tc->counter = 1;
{CentralCache *cc = &centralCache[sizeClass];
SPINLOCK_ACQUIRE(&cc->lock);
if (unlikely(!cc->firstBatch))//no free batch
{no_free_batch:{
unsigned int batchSize = batch_size(sizeClass)+1;
if (cc->freeList)
{
assert(cc->freeListSize);
if (likely(cc->freeListSize <= batchSize + 1))
{
tc->counter = batchSize - cc->freeListSize + 1;
// batchSize = cc->freeListSize;
cc->freeListSize = 0;
fb = cc->freeList;
cc->freeList = NULL;
}
else
{
cc->freeListSize -= batchSize;
fb = cc->freeList;
{FreeBlock *b = cc->freeList;
while (--batchSize) b = b->next;
cc->freeList = b->next;
b->next = NULL;}
}
SPINLOCK_RELEASE(&cc->lock);
tc->freeList = fb->next;
init_pthread_destructor();//this call must be placed carefully to allow recursive memory allocation from pthread_key_create (in case when ltalloc replaces the system malloc)
return fb;
}
{unsigned int blockSize = class_to_size(sizeClass);
if (cc->freeBlocksInLastChunk)
{
char *firstFree = cc->lastChunk;
assert(cc->lastChunk && cc->freeBlocksInLastChunk == (CHUNK_SIZE - ((uintptr_t)cc->lastChunk & (CHUNK_SIZE-1)))/blockSize);
if (cc->freeBlocksInLastChunk < batchSize) {
tc->counter = batchSize - cc->freeBlocksInLastChunk + 1;
batchSize = cc->freeBlocksInLastChunk;
}
cc->freeBlocksInLastChunk -= batchSize;
cc->lastChunk += blockSize * batchSize;
if (cc->freeBlocksInLastChunk == 0) {
assert(((uintptr_t)cc->lastChunk & (CHUNK_SIZE-1)) == 0);
cc->lastChunk = ((char**)cc->lastChunk)[-1];
if (cc->lastChunk)
cc->freeBlocksInLastChunk = (CHUNK_SIZE - ((uintptr_t)cc->lastChunk & (CHUNK_SIZE-1)))/blockSize;
}
SPINLOCK_RELEASE(&cc->lock);
fb = (FreeBlock*)firstFree;
while (--batchSize)
firstFree = (char*)(((FreeBlock*)firstFree)->next = (FreeBlock*)(firstFree + blockSize));
((FreeBlock*)firstFree)->next = NULL;
tc->freeList = fb->next;
init_pthread_destructor();
return fb;
}
//Allocate new chunk
SPINLOCK_RELEASE(&cc->lock);//release lock for a while
SPINLOCK_ACQUIRE(&pad.lock);
if (pad.freeChunk)
{
p = pad.freeChunk;
pad.freeChunk = *(void**)p;
pad.size -= CHUNK_SIZE;
SPINLOCK_RELEASE(&pad.lock);
((char**)((char*)p + CHUNK_SIZE))[-1] = 0;
} else {
SPINLOCK_RELEASE(&pad.lock);
p = sys_aligned_alloc(CHUNK_SIZE, CHUNK_SIZE);
if (unlikely(!p)) { CPPCODE(if (throw_) throw std::bad_alloc(); else) return NULL; }
}
#define CHUNK_IS_SMALL unlikely(sizeClass < get_size_class(2*sizeof(void*)))
{unsigned int numBlocksInChunk = (CHUNK_SIZE - (CHUNK_IS_SMALL ? sizeof(ChunkSm) : sizeof(Chunk)))/blockSize;
#ifndef _WIN32
//intptr_t sz = ((CHUNK_SIZE - numBlocksInChunk*blockSize) & ~(page_size()-1)) - page_size();
//if (sz > 0) mprotect((char*)p + page_size(), sz, PROT_NONE);//munmap((char*)p + page_size(), sz);//to make possible unmapping, we need to be more careful when returning memory to the system, not simply VMFREE(firstFreeChunk, CHUNK_SIZE), so let there be just mprotect
#endif
assert(((char**)((char*)p + CHUNK_SIZE))[-1] == 0);//assume that allocated memory is always zero filled (on first access); it is better not to zero it explicitly because it will lead to allocation of physical page which may never needed otherwise
if (numBlocksInChunk < batchSize) {
tc->counter = batchSize - numBlocksInChunk + 1;
batchSize = numBlocksInChunk;
}
//Prepare chunk
((Chunk*)p)->sizeClass = sizeClass;
{char *firstFree = (char*)p + CHUNK_SIZE - numBlocksInChunk*blockSize;//blocks in chunk are located in such way to achieve a maximum possible alignment
fb = (FreeBlock*)firstFree;
{int n = batchSize; while (--n)
firstFree = (char*)(((FreeBlock*)firstFree)->next = (FreeBlock*)(firstFree + blockSize));}
((FreeBlock*)firstFree)->next = NULL;
firstFree += blockSize;
SPINLOCK_ACQUIRE(&cc->lock);
if ((uintptr_t)p < cc->minChunkAddr || !cc->minChunkAddr) cc->minChunkAddr = (uintptr_t)p;
if ((uintptr_t)p > cc->maxChunkAddr ) cc->maxChunkAddr = (uintptr_t)p;
if (CHUNK_IS_SMALL)//special handling for smallest blocks of size = sizeof(void*)
{
ChunkSm *cs = (ChunkSm*)p;
cs->numBatches = 0;
//Insert new chunk right after chunkWithFreeBatches
cs->prev = cc->chunkWithFreeBatches;
if (cc->chunkWithFreeBatches) {
cs->next = cc->chunkWithFreeBatches->next;
if (cc->chunkWithFreeBatches->next) cc->chunkWithFreeBatches->next->prev = cs;
cc->chunkWithFreeBatches->next = cs;
} else {
cs->next = NULL;
cc->chunkWithFreeBatches = cs;
}
}
if (unlikely(cc->freeBlocksInLastChunk))//so happened that other thread have already allocated chunk for the same size class while the lock was released
{
//Hook pointer to the current lastChunk at the end of new chunk (another way is just put all blocks to cc->freeList which is much less effecient)
((char**)(((uintptr_t)firstFree & ~(CHUNK_SIZE-1)) + CHUNK_SIZE))[-1] = cc->lastChunk;
}
cc->freeBlocksInLastChunk = numBlocksInChunk - batchSize;
cc->lastChunk = firstFree;
}}}}}
else {
if (!CHUNK_IS_SMALL)//smallest blocks of size = sizeof(void*) are handled specially
{
fb = cc->firstBatch;
cc->firstBatch = fb->nextBatch;
}
else//size of block = sizeof(void*)
{
ChunkSm *cs = cc->chunkWithFreeBatches;
if (unlikely(cs->numBatches == 0))
{
if (unlikely(cs->prev == NULL)) goto no_free_batch;
cs = cc->chunkWithFreeBatches = cs->prev;
assert(cs->numBatches == NUM_OF_BATCHES_IN_CHUNK_SM);
}
fb = cs->batches[--cs->numBatches];
}
}
SPINLOCK_RELEASE(&cc->lock);}
tc->freeList = fb->next;
init_pthread_destructor();
return fb;
}
else//allocate block directly from the system
{
if (unlikely(size == 0)) return ltmalloc CPPCODE(<throw_>)(1);//return NULL;//doing this check here is better than on the top level
size = (size + CHUNK_SIZE-1) & ~(CHUNK_SIZE-1);
p = sys_aligned_alloc(CHUNK_SIZE, size);
#ifndef _WIN32
if (p) {
SPINLOCK_ACQUIRE(&ptrieLock);
PTrieNode *newNode;
if (ptrieFreeNodesList)
ptrieFreeNodesList = *(PTrieNode**)(newNode = ptrieFreeNodesList);
else if (ptrieNewAllocatedPage) {
newNode = ptrieNewAllocatedPage;
if (!((uintptr_t)++ptrieNewAllocatedPage & (page_size()-1)))
ptrieNewAllocatedPage = ((PTrieNode**)ptrieNewAllocatedPage)[-1];
} else {
SPINLOCK_RELEASE(&ptrieLock);
newNode = (PTrieNode*)VMALLOC(page_size());
if (unlikely(!newNode)) { CPPCODE(if (throw_) throw std::bad_alloc(); else) return NULL; }
assert(((char**)((char*)newNode + page_size()))[-1] == 0);
SPINLOCK_ACQUIRE(&ptrieLock);
((PTrieNode**)((char*)newNode + page_size()))[-1] = ptrieNewAllocatedPage;//in case if other thread also have just allocated a new page
ptrieNewAllocatedPage = newNode + 1;
}
ptrie_insert((uintptr_t)p, size, newNode);
SPINLOCK_RELEASE(&ptrieLock);
}
#endif
CPPCODE(if (throw_) if (unlikely(!p)) throw std::bad_alloc();)
return p;
}
}
CPPCODE(template <bool throw_> static) void *ltmalloc(size_t size)
{
unsigned int sizeClass = get_size_class(size);
ThreadCache *tc = &threadCache[sizeClass];
FreeBlock *fb = tc->freeList;
if (likely(fb))
{
tc->freeList = fb->next;
tc->counter++;
return fb;
}
else
return fetch_from_central_cache CPPCODE(<throw_>)(size, tc, sizeClass);
}
CPPCODE(void *ltmalloc(size_t size) {return ltmalloc<false>(size);})//for possible external usage
static void add_batch_to_central_cache(CentralCache *cc, unsigned int sizeClass, FreeBlock *batch)
{
if (!CHUNK_IS_SMALL)
{
batch->nextBatch = cc->firstBatch;
cc->firstBatch = batch;
}
else
{
ChunkSm *cs = cc->chunkWithFreeBatches;
if (unlikely(cs->numBatches == NUM_OF_BATCHES_IN_CHUNK_SM))
{
cs = cc->chunkWithFreeBatches = cc->chunkWithFreeBatches->next;
assert(cs && cs->numBatches == 0);
}
cs->batches[cs->numBatches++] = batch;
}
}
static NOINLINE void move_to_central_cache(ThreadCache *tc, unsigned int sizeClass)
{
init_pthread_destructor();//needed for cases when freed memory was allocated in the other thread and no alloc was called in this thread till its termination
tc->counter = batch_size(sizeClass);
if (tc->tempList)//move temp list to the central cache
{
CentralCache *cc = &centralCache[sizeClass];
SPINLOCK_ACQUIRE(&cc->lock);
add_batch_to_central_cache(cc, sizeClass, tc->tempList);
SPINLOCK_RELEASE(&cc->lock);
}
// else if (unlikely(!tc->freeList))//this is a first call (i.e. when counter = 0) - just initialization of counter needed
// {
// tc->counter--;
// return;
// }
tc->tempList = tc->freeList;
tc->freeList = NULL;
}
void ltfree(void *p)
{
if (likely((uintptr_t)p & (CHUNK_SIZE-1)))
{
unsigned int sizeClass = ((Chunk*)((uintptr_t)p & ~(CHUNK_SIZE-1)))->sizeClass;
ThreadCache *tc = &threadCache[sizeClass];
if (unlikely(--tc->counter < 0))
move_to_central_cache(tc, sizeClass);
((FreeBlock*)p)->next = tc->freeList;
tc->freeList = (FreeBlock*)p;
}
else
sys_free(p);
}
size_t ltmsize(void *p)
{
if (likely((uintptr_t)p & (CHUNK_SIZE-1)))
{
return class_to_size(((Chunk*)((uintptr_t)p & ~(CHUNK_SIZE-1)))->sizeClass);
}
else
{
if (p == NULL) return 0;
#ifdef _WIN32
{MEMORY_BASIC_INFORMATION mi;
VirtualQuery(p, &mi, sizeof(mi));
return mi.RegionSize;}
#else
SPINLOCK_ACQUIRE(&ptrieLock);
size_t size = ptrie_lookup((uintptr_t)p);
SPINLOCK_RELEASE(&ptrieLock);
return size;
#endif
}
}
static void release_thread_cache(void *p)
{
unsigned int sizeClass = 0; (void)p;
for (;sizeClass < NUMBER_OF_SIZE_CLASSES; sizeClass++)
{
ThreadCache *tc = &threadCache[sizeClass];
if (tc->freeList || tc->tempList)
{
FreeBlock *tail = tc->freeList;
unsigned int freeListSize = 1;
CentralCache *cc = &centralCache[sizeClass];
if (tail)
while (tail->next)//search for end of list
tail = tail->next, freeListSize++;
SPINLOCK_ACQUIRE(&cc->lock);
if (tc->tempList)
add_batch_to_central_cache(cc, sizeClass, tc->tempList);
if (tc->freeList) {//append tc->freeList to cc->freeList
tail->next = cc->freeList;
cc->freeList = tc->freeList;
assert(freeListSize == batch_size(sizeClass)+1 - tc->counter);
cc->freeListSize += freeListSize;
}
SPINLOCK_RELEASE(&cc->lock);
}
}
}
void ltsqueeze(size_t padsz)
{
unsigned int sizeClass = get_size_class(2*sizeof(void*));//skip small chunks because corresponding batches can not be efficiently detached from the central cache (if that becomes relevant, may be it worths to reimplement batches for small chunks from array to linked lists)
for (;sizeClass < NUMBER_OF_SIZE_CLASSES; sizeClass++)
{
CentralCache *cc = &centralCache[sizeClass];
if (cc->maxChunkAddr - cc->minChunkAddr <= CHUNK_SIZE)//preliminary check without lock (assume that writing to minChunkAddr/maxChunkAddr is atomic)
continue;
SPINLOCK_ACQUIRE(&cc->lock);
if (cc->maxChunkAddr - cc->minChunkAddr <= CHUNK_SIZE) {//quick check for theoretical possibility that at least one chunk is totally free
SPINLOCK_RELEASE(&cc->lock);
continue;
}
{uintptr_t minChunkAddr = cc->minChunkAddr;
size_t bufferSize = ((cc->maxChunkAddr - minChunkAddr) / CHUNK_SIZE + 1) * sizeof(short);
//Quickly detach all batches of the current size class from the central cache
unsigned int freeListSize = cc->freeListSize;
FreeBlock *firstBatch = cc->firstBatch, *freeList = cc->freeList;
cc->firstBatch = NULL;
cc->freeList = NULL;
cc->freeListSize = 0;
SPINLOCK_RELEASE(&cc->lock);
//1. Find out chunks with only free blocks via a simple counting the number of free blocks in each chunk
{char buffer[32*1024];//enough for 1GB address space
unsigned short *inChunkFreeBlocks = (unsigned short*)(bufferSize <= sizeof(buffer) ? memset(buffer, 0, bufferSize) : VMALLOC(bufferSize));
unsigned int numBlocksInChunk = (CHUNK_SIZE - (/*CHUNK_IS_SMALL ? sizeof(ChunkSm) : */sizeof(Chunk)))/class_to_size(sizeClass);
FreeBlock **pbatch, *block, **pblock;
Chunk *firstFreeChunk = NULL;
assert(numBlocksInChunk < (1U<<(sizeof(short)*8)));//in case if CHUNK_SIZE is too big that total count of blocks in it doesn't fit at short type (...may be use static_assert instead?)
if (inChunkFreeBlocks)//consider VMALLOC can fail
{
for (pbatch = &firstBatch; *pbatch; pbatch = &(*pbatch)->nextBatch)
for (block = *pbatch; block; block = block->next)
#define FREE_BLOCK(block) \
if (++inChunkFreeBlocks[((uintptr_t)block - minChunkAddr) / CHUNK_SIZE] == numBlocksInChunk)/*chunk is totally free*/\
{\
Chunk *chunk = (Chunk*)((uintptr_t)block & ~(CHUNK_SIZE-1));\
assert(chunk->sizeClass == sizeClass);/*just in case check before overwriting this info*/\
*(Chunk**)chunk = firstFreeChunk;/*put nextFreeChunk pointer right at the beginning of Chunk as there are always must be a space for one pointer before first memory block*/\
firstFreeChunk = chunk;\
}
FREE_BLOCK(block)
for (pblock = &freeList; *pblock; pblock = &(*pblock)->next)
FREE_BLOCK(*pblock)
#undef FREE_BLOCK
}
else {
for (pbatch = &firstBatch; *pbatch; pbatch = &(*pbatch)->nextBatch);
for (pblock = &freeList; *pblock; pblock = &(*pblock)->next);
}
if (firstFreeChunk)//is anything to release
{
//2. Unlink all matching blocks from the corresponding free lists
FreeBlock *additionalBatchesList = NULL, *additionalBlocksList = NULL, **abatch = &additionalBatchesList, **ablock = &additionalBlocksList;
unsigned int additionalBlocksListSize = 0, batchSize = batch_size(sizeClass)+1;
for (pbatch = &firstBatch; *pbatch;)
{
for (block = *pbatch; block; block = block->next)
if (inChunkFreeBlocks[((uintptr_t)block - minChunkAddr) / CHUNK_SIZE] == numBlocksInChunk)//if at least one block belongs to a releasable chunk, then this batch should be handled specially
{
FreeBlock *nextBatch = (*pbatch)->nextBatch;
for (block = *pbatch; block;)//re-add blocks of not-for-release chunks and organize them into another batches' list (to join it with the main later)
if (inChunkFreeBlocks[((uintptr_t)block - minChunkAddr) / CHUNK_SIZE] != numBlocksInChunk)//skip matching-for-release blocks
{
*ablock = block;
do//this loop needed only to minimize memory write operations, otherwise a simpler approach could be used (like in the next loop below)
{
ablock = &block->next;
block = block->next;
if (++additionalBlocksListSize == batchSize)
{
abatch = &(*abatch = additionalBlocksList)->nextBatch;
*abatch = NULL;
*ablock = NULL;
ablock = &additionalBlocksList;
additionalBlocksList = NULL;
additionalBlocksListSize = 0;
break;//to force *ablock = block; for starting a new batch
}
} while (block && inChunkFreeBlocks[((uintptr_t)block - minChunkAddr) / CHUNK_SIZE] != numBlocksInChunk);
}
else
block = block->next;
*ablock = NULL;
*pbatch = nextBatch;//unlink batch
goto continue_;
}
pbatch = &(*pbatch)->nextBatch;
continue_:;
}
for (block = freeList; block;)
if (inChunkFreeBlocks[((uintptr_t)block - minChunkAddr) / CHUNK_SIZE] != numBlocksInChunk)
{
//*pblock = (*pblock)->next, freeListSize--;//unlink block
ablock = &(*ablock = block)->next;
block = block->next;
*ablock = NULL;
if (++additionalBlocksListSize == batchSize)
{
abatch = &(*abatch = additionalBlocksList)->nextBatch;
*abatch = NULL;
ablock = &additionalBlocksList;
additionalBlocksList = NULL;
additionalBlocksListSize = 0;
}
}
else
block = block->next;
//Add additional lists
*abatch = *pbatch;
*pbatch = additionalBatchesList;
pblock = ablock;
freeList = additionalBlocksList;
freeListSize = additionalBlocksListSize;
//Return back all left not-for-release blocks to the central cache as quickly as possible (as other threads may want to allocate a new memory)
#define GIVE_LISTS_BACK_TO_CC \
SPINLOCK_ACQUIRE(&cc->lock);\
*pbatch = cc->firstBatch;\
cc->firstBatch = firstBatch;\
*pblock = cc->freeList;\
cc->freeList = freeList;\
cc->freeListSize += freeListSize;\
SPINLOCK_RELEASE(&cc->lock);\
if (bufferSize > sizeof(buffer)) VMFREE(inChunkFreeBlocks, bufferSize);//this better to do before 3. as kernel is likely optimized for release of just allocated range
GIVE_LISTS_BACK_TO_CC
if (padsz)
{
SPINLOCK_ACQUIRE(&pad.lock);
if (pad.size < padsz)
{
Chunk *first = firstFreeChunk, **c;
do//put off free chunks up to a specified pad size
{
c = (Chunk**)firstFreeChunk;
firstFreeChunk = *c;
pad.size += CHUNK_SIZE;
} while (pad.size < padsz && firstFreeChunk);
*c = (Chunk*)pad.freeChunk;
pad.freeChunk = first;
}
SPINLOCK_RELEASE(&pad.lock);
}
//3. Return memory to the system
while (firstFreeChunk)
{
Chunk *nextFreeChunk = *(Chunk**)firstFreeChunk;
VMFREE(firstFreeChunk, CHUNK_SIZE);
firstFreeChunk = nextFreeChunk;
}
}
else//nothing to release - just return batches back to the central cache
{
GIVE_LISTS_BACK_TO_CC
#undef GIVE_LISTS_BACK_TO_CC
}}}
}
}
#if defined(__cplusplus) && !defined(LTALLOC_DISABLE_OPERATOR_NEW_OVERRIDE)
void *operator new (size_t size) throw(std::bad_alloc) {return ltmalloc<true> (size);}
void *operator new (size_t size, const std::nothrow_t&) throw() {return ltmalloc<false>(size);}
void *operator new[](size_t size) throw(std::bad_alloc) {return ltmalloc<true> (size);}
void *operator new[](size_t size, const std::nothrow_t&) throw() {return ltmalloc<false>(size);}
void operator delete (void* p) throw() {ltfree(p);}
void operator delete (void* p, const std::nothrow_t&) throw() {ltfree(p);}
void operator delete[](void* p) throw() {ltfree(p);}
void operator delete[](void* p, const std::nothrow_t&) throw() {ltfree(p);}
#endif
/* @r-lyeh's { */
#include <string.h>
void *ltcalloc(size_t elems, size_t size) {
size *= elems;
return memset( ltmalloc( size ), 0, size );
}
void *ltmemalign( size_t align, size_t size ) {
return --align, ltmalloc( (size+align)&~align );
}
void *ltrealloc( void *ptr, size_t sz ) {
if( !ptr ) return ltmalloc( sz );
if( !sz ) return ltfree( ptr ), (void *)0;
size_t osz = ltmsize( ptr );
if( sz <= osz ) {
return ptr;
}
void *nptr = memcpy( ltmalloc(sz), ptr, osz );
ltfree( ptr );
#ifdef LTALLOC_AUTO_GC_INTERVAL
/* this is kind of compromise; the following timer is to guarantee
that memory gets wiped out at least every given seconds between consecutive
ltrealloc() calls (I am assuming frequency usage for ltrealloc() is smaller
than ltmalloc() or ltfree() too) - @r-lyeh */
clock_t now = clock();
static clock_t then = now;
if( ( double(now - then) / CLOCKS_PER_SEC ) > LTALLOC_AUTO_GC_INTERVAL ) {
ltsqueeze(0);
}
then = now;
#endif
return nptr;
}
/* } */

View File

@@ -1,14 +0,0 @@
#include <stdlib.h> /*a more portable size_t definition than stddef.h itself*/
#ifdef __cplusplus
extern "C" {
#endif
void* ltmalloc(size_t);
void ltfree(void*);
void* ltrealloc( void *, size_t );
void* ltcalloc( size_t, size_t );
void* ltmemalign( size_t, size_t );
void ltsqueeze(size_t pad); /*return memory to system (see README.md)*/
size_t ltmsize(void*);
#ifdef __cplusplus
}
#endif

View File

@@ -1,59 +0,0 @@
// based on code by Jerry Coffin (most likely Public Domain)
// - rlyeh
#pragma once
#include <stdlib.h>
#include <new>
#include <limits>
#include "ltalloc.h"
namespace lt {
template <class T>
struct allocator {
typedef size_t size_type;
typedef ptrdiff_t difference_type;
typedef T* pointer;
typedef const T* const_pointer;
typedef T& reference;
typedef const T& const_reference;
typedef T value_type;
template <class U> struct rebind { typedef allocator<U> other; };
allocator() throw() {}
allocator(const allocator&) throw() {}
template <class U> allocator(const allocator<U>&) throw(){}
~allocator() throw() {}
pointer address(reference x) const { return &x; }
const_pointer address(const_reference x) const { return &x; }
pointer allocate(size_type s, void const * = 0) {
if (0 == s)
return NULL;
pointer temp = (pointer)ltmalloc(s * sizeof(T));
if (temp == NULL)
throw std::bad_alloc();
return temp;
}
void deallocate(pointer p, size_type) {
ltfree(p);
}
size_type max_size() const throw() {
return std::numeric_limits<size_t>::max() / sizeof(T);
}
void construct(pointer p, const T& val) {
new((void *)p) T(val);
}
void destroy(pointer p) {
p->~T();
}
};
}

View File

@@ -27,12 +27,11 @@ solution "objview"
kind "ConsoleApp" kind "ConsoleApp"
language "C++" language "C++"
files { "viewer.cc", "trackball.cc", "ltalloc.cc" } files { "viewer.cc", "trackball.cc" }
includedirs { "./" } includedirs { "./" }
includedirs { "../../" } includedirs { "../../" }
flags { "c++11" } flags { "c++11" }
--buildoptions { "-std=c++11" }
if _OPTIONS['clang'] then if _OPTIONS['clang'] then
toolset "clang" toolset "clang"

View File

@@ -1,12 +1,12 @@
// //
// Optimized wavefront .obj loader. // Optimized wavefront .obj loader.
// Requires ltalloc and C++11 // Requires lfpAlloc and C++11
// //
/* /*
The MIT License (MIT) The MIT License (MIT)
Copyright (c) 2012-2016 Syoyo Fujita and many contributors. Copyright (c) 2012-2017 Syoyo Fujita and many contributors.
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal of this software and associated documentation files (the "Software"), to deal
@@ -38,7 +38,6 @@ THE SOFTWARE.
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/types.h> #include <sys/types.h>
#include <unistd.h> #include <unistd.h>
#include <unistd.h>
#endif #endif
#include <cassert> #include <cassert>
@@ -55,7 +54,7 @@ THE SOFTWARE.
#include <chrono> // C++11 #include <chrono> // C++11
#include <thread> // C++11 #include <thread> // C++11
#include "ltalloc.hpp" #include "lfpAlloc/Allocator.hpp"
namespace tinyobj_opt { namespace tinyobj_opt {
@@ -327,12 +326,12 @@ struct index_t {
}; };
typedef struct { typedef struct {
std::vector<float, lt::allocator<float> > vertices; std::vector<float, lfpAlloc::lfpAllocator<float> > vertices;
std::vector<float, lt::allocator<float> > normals; std::vector<float, lfpAlloc::lfpAllocator<float> > normals;
std::vector<float, lt::allocator<float> > texcoords; std::vector<float, lfpAlloc::lfpAllocator<float> > texcoords;
std::vector<index_t, lt::allocator<index_t> > indices; std::vector<index_t, lfpAlloc::lfpAllocator<index_t> > indices;
std::vector<int, lt::allocator<int> > face_num_verts; std::vector<int, lfpAlloc::lfpAllocator<int> > face_num_verts;
std::vector<int, lt::allocator<int> > material_ids; std::vector<int, lfpAlloc::lfpAllocator<int> > material_ids;
} attrib_t; } attrib_t;
typedef StackVector<char, 256> ShortString; typedef StackVector<char, 256> ShortString;
@@ -999,9 +998,9 @@ typedef struct {
float tx, ty; float tx, ty;
// for f // for f
std::vector<index_t, lt::allocator<index_t> > f; std::vector<index_t, lfpAlloc::lfpAllocator<index_t> > f;
// std::vector<index_t> f; // std::vector<index_t> f;
std::vector<int, lt::allocator<int> > f_num_verts; std::vector<int, lfpAlloc::lfpAllocator<int> > f_num_verts;
const char *group_name; const char *group_name;
unsigned int group_name_len; unsigned int group_name_len;
@@ -1271,7 +1270,8 @@ bool parseObj(attrib_t *attrib, std::vector<shape_t> *shapes,
auto t1 = std::chrono::high_resolution_clock::now(); auto t1 = std::chrono::high_resolution_clock::now();
std::vector<LineInfo, lt::allocator<LineInfo> > line_infos[kMaxThreads]; std::vector<LineInfo, lfpAlloc::lfpAllocator<LineInfo> >
line_infos[kMaxThreads];
for (size_t t = 0; t < static_cast<size_t>(num_threads); t++) { for (size_t t = 0; t < static_cast<size_t>(num_threads); t++) {
// Pre allocate enough memory. len / 128 / num_threads is just a heuristic // Pre allocate enough memory. len / 128 / num_threads is just a heuristic
// value. // value.
@@ -1558,8 +1558,9 @@ bool parseObj(attrib_t *attrib, std::vector<shape_t> *shapes,
index_t(vertex_index, texcoord_index, normal_index); index_t(vertex_index, texcoord_index, normal_index);
} }
for (size_t k = 0; k < commands[t][i].f_num_verts.size(); k++) { for (size_t k = 0; k < commands[t][i].f_num_verts.size(); k++) {
attrib->material_ids[face_count + k] = material_id; attrib->material_ids[face_count + k] = material_id;
attrib->face_num_verts[face_count + k] = commands[t][i].f_num_verts[k]; attrib->face_num_verts[face_count + k] =
commands[t][i].f_num_verts[k];
} }
f_count += commands[t][i].f.size(); f_count += commands[t][i].f.size();