-
Notifications
You must be signed in to change notification settings - Fork 8.1k
perf: move EG() and CG() in ZTS builds into __thread storage #22231
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
5801720
08c6319
7d846b4
a4ec0fc
6afa98f
5103bc4
f7984d4
1be279f
1809c21
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -42,6 +42,8 @@ typedef struct { | |
| ts_allocate_ctor ctor; | ||
| ts_allocate_dtor dtor; | ||
| size_t fast_offset; | ||
| /* When set, storage comes from __thread memory instead of being allocated by TSRM. */ | ||
| void *(*tls_addr)(void); | ||
| int done; | ||
| } tsrm_resource_type; | ||
|
|
||
|
|
@@ -163,14 +165,19 @@ TSRM_API bool tsrm_startup(int expected_threads, int expected_resources, int deb | |
|
|
||
| static void ts_free_resources(tsrm_tls_entry *thread_resources) | ||
| { | ||
| bool own_thread = thread_resources->thread_id == tsrm_thread_id(); | ||
|
|
||
| /* Need to destroy in reverse order to respect dependencies. */ | ||
| for (int i = thread_resources->count - 1; i >= 0; i--) { | ||
| if (!resource_types_table[i].done) { | ||
| if (resource_types_table[i].tls_addr && !own_thread) { | ||
| continue; | ||
| } | ||
| if (resource_types_table[i].dtor) { | ||
| resource_types_table[i].dtor(thread_resources->storage[i]); | ||
| } | ||
|
|
||
| if (!resource_types_table[i].fast_offset) { | ||
| if (!resource_types_table[i].fast_offset && !resource_types_table[i].tls_addr) { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We can't manually free __thread storage, because it was never allocated with malloc(). |
||
| free(thread_resources->storage[i]); | ||
| } | ||
| } | ||
|
|
@@ -256,7 +263,10 @@ static void tsrm_update_active_threads(void) | |
|
|
||
| p->storage = (void *) realloc(p->storage, sizeof(void *)*id_count); | ||
| for (j=p->count; j<id_count; j++) { | ||
| if (resource_types_table[j].fast_offset) { | ||
| if (resource_types_table[j].tls_addr) { | ||
| TSRM_ASSERT(p->thread_id == tsrm_thread_id()); | ||
| p->storage[j] = resource_types_table[j].tls_addr(); | ||
| } else if (resource_types_table[j].fast_offset) { | ||
| p->storage[j] = (void *) (((char*)p) + resource_types_table[j].fast_offset); | ||
| } else { | ||
| p->storage[j] = (void *) malloc(resource_types_table[j].size); | ||
|
|
@@ -301,6 +311,7 @@ TSRM_API ts_rsrc_id ts_allocate_id(ts_rsrc_id *rsrc_id, size_t size, ts_allocate | |
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].ctor = ctor; | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].dtor = dtor; | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].fast_offset = 0; | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].tls_addr = NULL; | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].done = 0; | ||
|
|
||
| tsrm_update_active_threads(); | ||
|
|
@@ -359,6 +370,7 @@ TSRM_API ts_rsrc_id ts_allocate_fast_id(ts_rsrc_id *rsrc_id, size_t *offset, siz | |
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].ctor = ctor; | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].dtor = dtor; | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].fast_offset = *offset; | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].tls_addr = NULL; | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].done = 0; | ||
|
|
||
| tsrm_update_active_threads(); | ||
|
|
@@ -368,6 +380,41 @@ TSRM_API ts_rsrc_id ts_allocate_fast_id(ts_rsrc_id *rsrc_id, size_t *offset, siz | |
| return *rsrc_id; | ||
| }/*}}}*/ | ||
|
|
||
| /* Allocates a resource id whose per-thread storage is a native __thread block | ||
| * rather than memory malloc()ed by TSRM. | ||
| * | ||
| * rsrc_id:  out parameter; receives the new shuffled id, or 0 on failure. | ||
| * tls_addr: callback stored in the resource type; TSRM calls it to obtain the | ||
| *           storage address instead of allocating `size` bytes itself. | ||
| * size, ctor, dtor: recorded in the resource type like for any other id. | ||
| * | ||
| * Returns the new id, or 0 if the resource type table could not be grown. | ||
| * Must be called at startup before any other thread exists. */ | ||
| TSRM_API ts_rsrc_id ts_allocate_tls_id(ts_rsrc_id *rsrc_id, void *(*tls_addr)(void), size_t size, ts_allocate_ctor ctor, ts_allocate_dtor dtor) | ||
| {/*{{{*/ | ||
| /* size is a size_t, so it has to be formatted with %zu; %d is a type mismatch. */ | ||
| TSRM_ERROR((TSRM_ERROR_LEVEL_CORE, "Obtaining a new TLS resource id, %zu bytes", size)); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This function is largely copied from the one above; looking at it now, I see that the size_t argument should be printed with %zu rather than %d. |
||
|
|
||
| tsrm_mutex_lock(tsmm_mutex); | ||
|
|
||
| /* obtain a resource id */ | ||
| *rsrc_id = TSRM_SHUFFLE_RSRC_ID(id_count++); | ||
|
|
||
| /* grow the resource type table when the new id does not fit */ | ||
| if (resource_types_table_size < id_count) { | ||
| tsrm_resource_type *_tmp; | ||
| _tmp = (tsrm_resource_type *) realloc(resource_types_table, sizeof(tsrm_resource_type)*id_count); | ||
| if (!_tmp) { | ||
| TSRM_ERROR((TSRM_ERROR_LEVEL_ERROR, "Unable to allocate storage for resource")); | ||
| *rsrc_id = 0; | ||
| tsrm_mutex_unlock(tsmm_mutex); | ||
| return 0; | ||
| } | ||
| resource_types_table = _tmp; | ||
| resource_types_table_size = id_count; | ||
| } | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].size = size; | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].ctor = ctor; | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].dtor = dtor; | ||
| /* a TLS resource lives outside the reserved fast area, so it has no fast offset */ | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].fast_offset = 0; | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].tls_addr = tls_addr; | ||
| resource_types_table[TSRM_UNSHUFFLE_RSRC_ID(*rsrc_id)].done = 0; | ||
|
|
||
| tsrm_update_active_threads(); | ||
| tsrm_mutex_unlock(tsmm_mutex); | ||
|
|
||
| TSRM_ERROR((TSRM_ERROR_LEVEL_CORE, "Successfully allocated new TLS resource id %d", *rsrc_id)); | ||
| return *rsrc_id; | ||
| }/*}}}*/ | ||
|
|
||
| static void set_thread_local_storage_resource_to(tsrm_tls_entry *thread_resource) | ||
| { | ||
| tsrm_tls_set(thread_resource); | ||
|
|
@@ -397,7 +444,9 @@ static void allocate_new_resource(tsrm_tls_entry **thread_resources_ptr, THREAD_ | |
| if (resource_types_table[i].done) { | ||
| (*thread_resources_ptr)->storage[i] = NULL; | ||
| } else { | ||
| if (resource_types_table[i].fast_offset) { | ||
| if (resource_types_table[i].tls_addr) { | ||
| (*thread_resources_ptr)->storage[i] = resource_types_table[i].tls_addr(); | ||
| } else if (resource_types_table[i].fast_offset) { | ||
| (*thread_resources_ptr)->storage[i] = (void *) (((char*)(*thread_resources_ptr)) + resource_types_table[i].fast_offset); | ||
| } else { | ||
| (*thread_resources_ptr)->storage[i] = (void *) malloc(resource_types_table[i].size); | ||
|
|
@@ -486,6 +535,7 @@ TSRM_API void *ts_resource_ex(ts_rsrc_id id, THREAD_T *th_id) | |
| * use the global pointer, we need to setup the global pointer temporarily here. */ | ||
| set_thread_local_storage_resource_to(thread_resources); | ||
| /* Free up the old resource from the old thread instance */ | ||
| thread_resources->thread_id = 0; | ||
| ts_free_resources(thread_resources); | ||
| free(thread_resources); | ||
| /* Allocate a new resource at the same point in the linked list, and relink the next pointer */ | ||
|
|
@@ -559,7 +609,7 @@ void ts_free_id(ts_rsrc_id id) | |
| if (resource_types_table[rsrc_id].dtor) { | ||
| resource_types_table[rsrc_id].dtor(p->storage[rsrc_id]); | ||
| } | ||
| if (!resource_types_table[rsrc_id].fast_offset) { | ||
| if (!resource_types_table[rsrc_id].fast_offset && !resource_types_table[rsrc_id].tls_addr) { | ||
| free(p->storage[rsrc_id]); | ||
| } | ||
| } | ||
|
|
@@ -773,7 +823,10 @@ TSRM_API void *tsrm_get_ls_cache(void) | |
| /* Returns offset of tsrm_ls_cache slot from Thread Control Block address */ | ||
| TSRM_API size_t tsrm_get_ls_cache_tcb_offset(void) | ||
| {/*{{{*/ | ||
| #if defined(__APPLE__) && defined(__x86_64__) | ||
| #if defined(TSRM_TLS_MODEL_GLOBAL_DYNAMIC) | ||
| /* No constant TCB offset under global-dynamic, can't use fast path */ | ||
| return 0; | ||
| #elif defined(__APPLE__) && defined(__x86_64__) | ||
| // TODO: Implement support for fast JIT ZTS code ??? | ||
| return 0; | ||
| #elif defined(__x86_64__) && defined(__GNUC__) && !defined(__FreeBSD__) && \ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -93,6 +93,8 @@ TSRM_API ts_rsrc_id ts_allocate_id(ts_rsrc_id *rsrc_id, size_t size, ts_allocate | |
| /* Fast resource in reserved (pre-allocated) space */ | ||
| TSRM_API void tsrm_reserve(size_t size); | ||
| TSRM_API ts_rsrc_id ts_allocate_fast_id(ts_rsrc_id *rsrc_id, size_t *offset, size_t size, ts_allocate_ctor ctor, ts_allocate_dtor dtor); | ||
| /* Must be called at startup before any other thread exists. */ | ||
| TSRM_API ts_rsrc_id ts_allocate_tls_id(ts_rsrc_id *rsrc_id, void *(*tls_addr)(void), size_t size, ts_allocate_ctor ctor, ts_allocate_dtor dtor); | ||
|
|
||
| /* fetches the requested resource for the current thread */ | ||
| TSRM_API void *ts_resource_ex(ts_rsrc_id id, THREAD_T *th_id); | ||
|
|
@@ -155,9 +157,14 @@ TSRM_API bool tsrm_is_managed_thread(void); | |
| #if !__has_attribute(tls_model) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__MUSL__) || defined(__HAIKU__) | ||
| # define TSRM_TLS_MODEL_ATTR | ||
| # define TSRM_TLS_MODEL_DEFAULT | ||
| #elif __PIC__ | ||
| # define TSRM_TLS_MODEL_ATTR __attribute__((tls_model("initial-exec"))) | ||
| # define TSRM_TLS_MODEL_INITIAL_EXEC | ||
| #elif __PIC__ && !defined(__PIE__) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. A PIE program can use local-exec if it's the main executable. Only shared libraries (embed, extensions) need to fall back to initial-exec. This alone would already be a small speedup (one fewer instruction per access). |
||
| # if defined(TSRM_TLS_MODEL_USE_GLOBAL_DYNAMIC) | ||
| # define TSRM_TLS_MODEL_ATTR __attribute__((tls_model("global-dynamic"))) | ||
| # define TSRM_TLS_MODEL_GLOBAL_DYNAMIC | ||
| # else | ||
| # define TSRM_TLS_MODEL_ATTR __attribute__((tls_model("initial-exec"))) | ||
| # define TSRM_TLS_MODEL_INITIAL_EXEC | ||
| # endif | ||
| #else | ||
| # define TSRM_TLS_MODEL_ATTR __attribute__((tls_model("local-exec"))) | ||
| # define TSRM_TLS_MODEL_LOCAL_EXEC | ||
|
|
@@ -175,17 +182,27 @@ TSRM_API bool tsrm_is_managed_thread(void); | |
| #define TSRMG_BULK_STATIC(id, type) ((type) (*((void ***) TSRMLS_CACHE))[TSRM_UNSHUFFLE_RSRC_ID(id)]) | ||
| #define TSRMG_FAST_STATIC(offset, type, element) (TSRMG_FAST_BULK_STATIC(offset, type)->element) | ||
| #define TSRMG_FAST_BULK_STATIC(offset, type) ((type) (((char*) TSRMLS_CACHE)+(offset))) | ||
| struct _zend_tsrm_ls_cache; | ||
| #if defined(ZEND_WIN32) && !defined(LIBZEND_EXPORTS) | ||
| /* Windows can't dllexport the TLS struct, so outside Zend each module | ||
| * keeps a per-module `void *` pointer and reaches EG/CG via the resource-id indirection. */ | ||
| # define ZEND_TSRMLS_CACHE_T void * | ||
| # define TSRMLS_MAIN_CACHE_DEFINE() TSRM_TLS void *_tsrm_ls_cache TSRM_TLS_MODEL_ATTR = NULL; | ||
| # define TSRMLS_CACHE_DEFINE() TSRM_TLS void *_tsrm_ls_cache = NULL; | ||
| #else | ||
| # define ZEND_TSRMLS_CACHE_T struct _zend_tsrm_ls_cache | ||
| # define TSRMLS_MAIN_CACHE_DEFINE() | ||
| # define TSRMLS_CACHE_DEFINE() | ||
| #endif | ||
| #ifdef __cplusplus | ||
| #define TSRMLS_MAIN_CACHE_EXTERN() extern "C" { extern TSRM_TLS void *TSRMLS_CACHE TSRM_TLS_MODEL_ATTR; } | ||
| #define TSRMLS_CACHE_EXTERN() extern "C" { extern TSRM_TLS void *TSRMLS_CACHE; } | ||
| #define TSRMLS_MAIN_CACHE_EXTERN() extern "C" { extern TSRM_TLS ZEND_TSRMLS_CACHE_T _tsrm_ls_cache TSRM_TLS_MODEL_ATTR; } | ||
| #define TSRMLS_CACHE_EXTERN() extern "C" { extern TSRM_TLS ZEND_TSRMLS_CACHE_T _tsrm_ls_cache; } | ||
| #else | ||
| #define TSRMLS_MAIN_CACHE_EXTERN() extern TSRM_TLS void *TSRMLS_CACHE TSRM_TLS_MODEL_ATTR; | ||
| #define TSRMLS_CACHE_EXTERN() extern TSRM_TLS void *TSRMLS_CACHE; | ||
| #define TSRMLS_MAIN_CACHE_EXTERN() extern TSRM_TLS ZEND_TSRMLS_CACHE_T _tsrm_ls_cache TSRM_TLS_MODEL_ATTR; | ||
| #define TSRMLS_CACHE_EXTERN() extern TSRM_TLS ZEND_TSRMLS_CACHE_T _tsrm_ls_cache; | ||
| #endif | ||
| #define TSRMLS_MAIN_CACHE_DEFINE() TSRM_TLS void *TSRMLS_CACHE TSRM_TLS_MODEL_ATTR = NULL; | ||
| #define TSRMLS_CACHE_DEFINE() TSRM_TLS void *TSRMLS_CACHE = NULL; | ||
| #define TSRMLS_CACHE_UPDATE() TSRMLS_CACHE = tsrm_get_ls_cache() | ||
| #define TSRMLS_CACHE _tsrm_ls_cache | ||
| #define TSRMLS_CACHE (*(void **) &_tsrm_ls_cache) | ||
|
|
||
| #ifdef __cplusplus | ||
| } | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.