diff --git a/oshmem/mca/memheap/base/memheap_base_select.c b/oshmem/mca/memheap/base/memheap_base_select.c index bc5872efad5..30a5515a36e 100644 --- a/oshmem/mca/memheap/base/memheap_base_select.c +++ b/oshmem/mca/memheap/base/memheap_base_select.c @@ -26,6 +26,8 @@ #include "oshmem/mca/sshmem/base/base.h" #include "ompi/util/timings.h" +#include + mca_memheap_base_config_t mca_memheap_base_config = { .device_nic_mem_seg_size = 0 }; @@ -107,6 +109,136 @@ static size_t _memheap_size(void) return (size_t) memheap_align(oshmem_shmem_info_env.symmetric_heap_size); } +static void *memheap_mmap_get(void *hint, size_t size) +{ + void *addr; + + addr = mmap(hint, size, + PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (addr == MAP_FAILED) { + return NULL; + } + + return addr; +} + +static int memheap_exchange_base_address(size_t size, void **address) +{ + int nprocs = oshmem_num_procs(); + int need_sync = (*address == NULL); + void *base = NULL; + void *ptr = NULL; + int rc, i; + void **bases; + + bases = calloc(nprocs, sizeof(*bases)); + if (NULL == bases) { + return OSHMEM_ERROR; + } + + if (oshmem_my_proc_id() == 0) { + ptr = memheap_mmap_get(NULL, size); + base = ptr; + } + + rc = oshmem_shmem_bcast(&base, sizeof(base), 0); + if (OSHMEM_SUCCESS != rc) { + MEMHEAP_ERROR("Failed to exchange allocated vma for base segment " + "(error %d)", rc); + goto out; + } + + if (oshmem_my_proc_id() != 0) { + ptr = memheap_mmap_get(base, size); + } + + MEMHEAP_VERBOSE(100, "#%d: exchange base address: base %p: %s", + oshmem_my_proc_id(), base, + (base == ptr)? "ok" : "unavailable"); + + *address = base; + if (need_sync) { + /* They all succeed or fail to allow fallback */ + rc = oshmem_shmem_allgather(&ptr, bases, sizeof(ptr)); + if (OSHMEM_SUCCESS != rc) { + MEMHEAP_ERROR("Failed to exchange selected vma for base segment " + "(error %d)", rc); + goto out; + } + + for (i = 0; i < nprocs; i++) { + if ((NULL == bases[i]) || (bases[i] != base)) { + *address = NULL; + break; + } + } + } else if (ptr != base) { + /* Any failure terminates the rank and others start teardown */ + rc = OSHMEM_ERROR; + } + +out: + if (((OSHMEM_SUCCESS != rc) || (*address == NULL)) && (ptr != NULL)) { + (void)munmap(ptr, size); + } + + free(bases); + return rc; +} + + +/* + * The returned mca_sshmem_base_start_address value is reserved by using + * mmap() for the expected size. + */ +static int memheap_base_segment_setup(size_t size) +{ + int rc; + + if ((mca_sshmem_base_start_address == (void *)UINTPTR_MAX) || + (mca_sshmem_base_start_address == NULL)) { + if (UINTPTR_MAX == 0xFFFFFFFF) { + /** + * if 32 bit we set sshmem_base_start_adress to 0 + * to let OS allocate segment automatically + */ + mca_sshmem_base_start_address = NULL; + return OSHMEM_SUCCESS; + } + + rc = memheap_exchange_base_address(size, &mca_sshmem_base_start_address); + if (OSHMEM_SUCCESS != rc) { + MEMHEAP_ERROR("Failed to setup base segment address (error %d)", rc); + return rc; + } + + if (NULL != mca_sshmem_base_start_address) { + goto done; /* Region is reserved */ + } + +#if defined(__aarch64__) + mca_sshmem_base_start_address = (void*)0xAB0000000000; +#else + mca_sshmem_base_start_address = (void*)0xFF000000; +#endif + } + + if (mca_sshmem_base_start_address != memheap_mmap_get( + mca_sshmem_base_start_address, size)) { + MEMHEAP_ERROR("Failed to create segment address %p/%zu", + mca_sshmem_base_start_address, size); + return OSHMEM_ERROR; + } + +done: + if (oshmem_my_proc_id() == 0) { + MEMHEAP_VERBOSE(10, "Using symmetric segment address %p/%zu", + mca_sshmem_base_start_address, size); + } + + return OSHMEM_SUCCESS; +} + static memheap_context_t* _memheap_create(void) { int rc = OSHMEM_SUCCESS; @@ -124,13 +256,18 @@ static memheap_context_t* _memheap_create(void) OPAL_TIMING_ENV_NEXT(timing, "_memheap_size()"); - /* Inititialize symmetric area */ - if (OSHMEM_SUCCESS == rc) { - rc = mca_memheap_base_alloc_init(&mca_memheap_base_map, - user_size + MEMHEAP_BASE_PRIVATE_SIZE, 0, - "regular_mem"); + /* Locate and reserve symmetric area */ + rc = memheap_base_segment_setup(user_size + MEMHEAP_BASE_PRIVATE_SIZE); + if (OSHMEM_SUCCESS != rc) { + MEMHEAP_ERROR("Failed to negotiate base segment addres"); + return NULL; } + /* Initialize symmetric area */ + rc = mca_memheap_base_alloc_init(&mca_memheap_base_map, + user_size + MEMHEAP_BASE_PRIVATE_SIZE, 0, + "regular_mem"); + OPAL_TIMING_ENV_NEXT(timing, "mca_memheap_base_alloc_init()"); /* Initialize atomic symmetric area */ diff --git a/oshmem/mca/memheap/base/memheap_base_static.c b/oshmem/mca/memheap/base/memheap_base_static.c index 4f1138102b4..29718699362 100644 --- a/oshmem/mca/memheap/base/memheap_base_static.c +++ b/oshmem/mca/memheap/base/memheap_base_static.c @@ -23,6 +23,9 @@ #include static int _check_perms(const char *perm); +static int _check_non_static_segment(const map_segment_t *mem_segs, + int n_segment, + const void *start, const void *end); static int _check_address(void *start, void **end); static int _check_pathname(uint64_t inode, const char *pathname); @@ -30,6 +33,7 @@ int mca_memheap_base_static_init(mca_memheap_map_t *map) { /* read and parse segments from /proc/self/maps */ int ret = OSHMEM_SUCCESS; + int n_segments = map->n_segments; uint64_t total_mem = 0; void* start; void* end; @@ -52,14 +56,6 @@ int mca_memheap_base_static_init(mca_memheap_map_t *map) return OSHMEM_ERROR; } -#ifdef __linux__ - extern unsigned _end; - if (mca_sshmem_base_start_address < (uintptr_t)&_end) { - MEMHEAP_VERBOSE(1, "sshmem base start address is inside data region" - " (%p < %p)", mca_sshmem_base_start_address, &_end); - } -#endif - while (NULL != fgets(line, sizeof(line), fp)) { if (3 > sscanf(line, "%llx-%llx %s %llx %s %llx %s", @@ -75,6 +71,12 @@ int mca_memheap_base_static_init(mca_memheap_map_t *map) goto out; } + if (OSHMEM_ERROR == _check_non_static_segment( + map->mem_segs, n_segments, + start, end)) { + continue; + } + if (OSHMEM_ERROR == _check_address(start, &end)) continue; @@ -136,6 +138,26 @@ static int _check_perms(const char *perms) return OSHMEM_ERROR; } +static int _check_non_static_segment(const map_segment_t *mem_segs, + int n_segment, + const void *start, const void *end) +{ + int i; + + for (i = 0; i < n_segment; i++) { + if ((start <= mem_segs[i].super.va_base) && + (mem_segs[i].super.va_base < end)) { + MEMHEAP_VERBOSE(100, + "non static segment: %p-%p already exists as %p-%p", + start, end, mem_segs[i].super.va_base, + mem_segs[i].super.va_end); + return OSHMEM_ERROR; + } + } + + return OSHMEM_SUCCESS; +} + static int _check_address(void *start, void **end) { /* FIXME Linux specific code */ @@ -146,11 +168,9 @@ static int _check_address(void *start, void **end) /** * SGI shmem only supports globals&static in main program. * It does not support them in shared objects or in dlopen() - * (Clarified on PGAS 2011 tutorial) + * (Clarified on PGAS 2011 tutorial). * - * So ignored any maps that start higher then process _end - * FIXME: make sure we do not register symmetric heap twice - * if we decide to allow shared objects + * So ignored any maps that start higher then process _end. */ if ((uintptr_t)start > data_end) { MEMHEAP_VERBOSE(100, diff --git a/oshmem/mca/sshmem/base/sshmem_base_open.c b/oshmem/mca/sshmem/base/sshmem_base_open.c index 08845df6de6..1f0d1eb761e 100644 --- a/oshmem/mca/sshmem/base/sshmem_base_open.c +++ b/oshmem/mca/sshmem/base/sshmem_base_open.c @@ -31,17 +31,7 @@ * globals */ -/** - * if 32 bit we set sshmem_base_start_adress to 0 - * to let OS allocate segment automatically - */ -#if UINTPTR_MAX == 0xFFFFFFFF -void *mca_sshmem_base_start_address = (void*)0; -#elif defined(__aarch64__) -void* mca_sshmem_base_start_address = (void*)0xAB0000000000; -#else -void* mca_sshmem_base_start_address = (void*)0xFF000000; -#endif +void *mca_sshmem_base_start_address = UINTPTR_MAX; char * mca_sshmem_base_backing_file_dir = NULL; diff --git a/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c b/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c index fe939df35d1..73565c39405 100644 --- a/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c +++ b/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c @@ -169,6 +169,7 @@ segment_create(map_segment_t *ds_buf, } /* Attach to the segment */ + (void)munmap(mca_sshmem_base_start_address, size); addr = shmat(shmid, (void *) mca_sshmem_base_start_address, 0); if (addr == (void *) -1L) { opal_show_help("help-oshmem-sshmem.txt", diff --git a/oshmem/runtime/oshmem_shmem_exchange.c b/oshmem/runtime/oshmem_shmem_exchange.c index 730eaef2e46..147340e53c4 100644 --- a/oshmem/runtime/oshmem_shmem_exchange.c +++ b/oshmem/runtime/oshmem_shmem_exchange.c @@ -16,6 +16,15 @@ #include "oshmem/runtime/runtime.h" #include "oshmem/runtime/params.h" +int oshmem_shmem_bcast(void *buf, int elem_size, int root) +{ + int rc; + + rc = PMPI_Bcast(buf, elem_size, MPI_BYTE, root, oshmem_comm_world); + + return rc; +} + int oshmem_shmem_allgather(void *send_buf, void *rcv_buf, int elem_size) { int rc; diff --git a/oshmem/runtime/runtime.h b/oshmem/runtime/runtime.h index 28f22f3eab7..1b19d9b8486 100644 --- a/oshmem/runtime/runtime.h +++ b/oshmem/runtime/runtime.h @@ -127,6 +127,11 @@ int oshmem_shmem_finalize(void); */ OSHMEM_DECLSPEC int oshmem_shmem_abort(int errcode); +/** + * Broadcast between all PEs + */ +OSHMEM_DECLSPEC int oshmem_shmem_bcast(void *buf, int elem_size, int root); + /** * Allgather between all PEs */