diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index a6b6861f2..be56d54a9 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -20,6 +20,7 @@ - [ ] Performance enhancement (non-breaking change which improves efficiency) - [ ] Code cleanup (non-breaking change which makes code smaller or more readable) - [ ] Breaking change (fix or feature that would cause existing functionality to change) +- [ ] Testing (addition of new tests or update to current tests) - [ ] Documentation (a change to man pages or other documentation) ### Checklist: diff --git a/.gitignore b/.gitignore index c851036c6..9da40ac40 100644 --- a/.gitignore +++ b/.gitignore @@ -71,6 +71,7 @@ examples/src/*-static t/sys/open.t t/test-results/ t/unifyfs_unmount.t +t/seg_tree_test.t t/test_run_env.sh deps install diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1351251ea..3b2f27765 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,46 +1,80 @@ +# Both testing stages depend on the build stage. By using the "needs" +# keyword, we prevent the testing stages from blocking, in favor of a +# DAG. stages: + - init - build - test-unit - test-integ -cache: - paths: - - spack_ci/ +##### System Templates ##### -##### Templates ##### +# Generic system templates used to contruct the final jobs on specific +# systems within their respective .yml file. Currently +# these are LLNL specific, but can be adjusted or added to as new +# systems become available. +# +# The NNODES, WALL_TIME, and STORAGE_SIZE variables can be altered in +# Gitlab interface if/when the defaults need to be changed. -.quartz-template: &quartz_template - tags: - - quartz - - shell - variables: - LLNL_SERVICE_USER: "unifysrv" +.base-template: retry: max: 1 when: - unknown_failure - stuck_or_timeout_failure -.butte-template: &butte_template - tags: - - butte - - shell +.slurm-single-node-template: variables: - LLNL_SERVICE_USER: "unifysrv" - retry: - max: 1 - when: - - unknown_failure - - stuck_or_timeout_failure + JOB_LAUNCH_COMMAND: "srun -N1 -n1" + LLNL_SLURM_SCHEDULER_PARAMETERS: "-N 1 -p $QUEUE -t $UNIT_WALL_TIME -J unifyfs-unit-tests" -.build-template: &build_template +.slurm-multi-node-template: + variables: + LLNL_SLURM_SCHEDULER_PARAMETERS: "-N $NNODES -p $QUEUE -t $INTEG_WALL_TIME -J unifyfs-integ-tests" + +.lsf-single-node-template: + variables: + JOB_LAUNCH_COMMAND: "jsrun -r1 -n1" + LLNL_LSF_SCHEDULER_PARAMETERS: "-nnodes 1 -q $QUEUE -W $UNIT_WALL_TIME -J unifyfs-unit-tests" + SCHEDULER_PARAMETERS: "-nnodes 1 -P $PROJECT_ID -W $UNIT_WALL_TIME -J unifyfs-unit-tests" + +.lsf-multi-node-template: + variables: + LLNL_LSF_SCHEDULER_PARAMETERS: "-nnodes $NNODES $STAGE_STORAGE -q $QUEUE -W $INTEG_WALL_TIME -J unifyfs-integ-tests" + SCHEDULER_PARAMETERS: "-nnodes $NNODES -P $PROJECT_ID -W $INTEG_WALL_TIME -J unifyfs-integ-tests" + +##### Job Templates ##### + +# Only use this template in a pre-build job if needing to clone and +# run subsequent jobs from a non-default location. +# The WORKING_DIR envar needs to be defined in the job variables. +# +# The before_script section here overrides the default before_script +# for jobs using this template. +.init-template: + stage: init + before_script: + - mkdir -pv $WORKING_DIR + - cd $WORKING_DIR + script: + - git clone -b ${CI_COMMIT_BRANCH} --depth=1 ${CI_REPOSITORY_URL} $WORKING_DIR + +# Build script used by each system. 
The CC and FC variables are set in +# the specific job scripts and evaluated in the before_script in order +# to customize which compiler will be used for each job. +# An artifact is created to pass on to the testing stages. The +# test-unit stage requires the unifyfs-build/ files and the test-integ +# stage requires the unifyfs-install/ files. +.build-template: stage: build script: - ./autogen.sh - mkdir -p unifyfs-build unifyfs-install && cd unifyfs-build - - ../configure --prefix=$CI_PROJECT_DIR/unifyfs-install --enable-fortran --disable-silent-rules + - ../configure CC=$CC_PATH FC=$FC_PATH --prefix=${WORKING_DIR}/unifyfs-install --enable-fortran --disable-silent-rules - make V=1 - make V=1 install + needs: [] artifacts: name: "${CI_JOB_NAME}-${CI_PIPELINE_ID}" untracked: true @@ -49,84 +83,54 @@ cache: - unifyfs-build/ - unifyfs-install/ -.unit-test-template: &unit_test_template +.unit-test-template: stage: test-unit script: - - cd unifyfs-build/t && make check + - cd unifyfs-build/t && $JOB_LAUNCH_COMMAND make check after_script: - - rm -rf /tmp/unify* /tmp/tmp.* /tmp/mdhim* /tmp/na_sm - -.quartz-batch-variables: - variables: &quartz_batch_variables - LLNL_SLURM_SCHEDULER_PARAMETERS: "-N $NNODES -p pbatch -t $WALL_TIME" - LLNL_SERVICE_USER: "unifysrv" - CI_PROJDIR: "$CI_PROJECT_DIR" - UNIFYFS_INSTALL: "$CI_PROJECT_DIR/unifyfs-install" - CI_NPROCS: "$NPROCS" + - rm -rf /tmp/unify* /tmp/tmp.* /tmp/mdhim* /tmp/na_sm | true -.butte-batch-variables: - variables: &butte_batch_variables - LLNL_LSF_SCHEDULER_PARAMETERS: "-nnodes $NNODES -q pbatch -W $WALL_TIME" - LLNL_SERVICE_USER: "unifysrv" - CI_PROJDIR: "$CI_PROJECT_DIR" - UNIFYFS_INSTALL: "$CI_PROJECT_DIR/unifyfs-install" - CI_NPROCS: "$NPROCS" +# Variables here are used for the integration test suite and can be +# adjusted in the Gitlab interface. See our testing documentation for +# full details. +.integ-test-template: + stage: test-integ + script: + - cd t/ci && prove -v RUN_CI_TESTS.sh ##### Jobs ##### +# Since Gitlab currently runs in the user's home environment, the +# before_script is currently only set up to load the proper Spack +# modules, if they are available, to prevent changing the user's +# environment. Install any needed modules in the user's environment +# prior to running when new compilers or architectures need to be +# tested. +# +# For jobs running in the not-default location, change directories +# to the WORKING_DIR directory. Otherwise, set WORKING_DIR to be the +# CI_PROJECT_DIR for the build step. +# +# The COMPILER, CC_PATH, and FC_PATH variables are evaluated here. Set +# them in their specific job scripts. +# SPACK_COMPILER and SPACK_ARCH are then set to load the matching +# dependencies for the desired compiler. before_script: - # HERE BE DRAGONS!: Since on HPC and running as user, Spack might already - # exist and can get complicated if we install it again. - # - # check for sourced spack || check for unsourced spack in $HOME/spack and - # source it || check for cached spack, clone if none, and source it - - which spack || ((cd $HOME/spack && git describe) && . $HOME/spack/share/spack/setup-env.sh) || (((cd spack_ci && git describe) || git clone https://github.com/CamStan/spack spack_ci) && . 
spack_ci/share/spack/setup-env.sh) - - SPACK_ARCH=$(spack arch) - - spack install leveldb && spack load leveldb arch=$SPACK_ARCH - - spack install gotcha@0.0.2 && spack load gotcha@0.0.2 arch=$SPACK_ARCH - - spack install flatcc && spack load flatcc arch=$SPACK_ARCH - - spack install margo^mercury+bmi~boostsys && spack load argobots arch=$SPACK_ARCH && spack load mercury arch=$SPACK_ARCH && spack load margo arch=$SPACK_ARCH - -build-quartz: - <<: *quartz_template - <<: *build_template - -build-butte: - <<: *butte_template - <<: *build_template - -unit-test-quartz: - <<: *quartz_template - <<: *unit_test_template - dependencies: - - build-quartz - -unit-test-butte: - <<: *butte_template - <<: *unit_test_template - dependencies: - - build-butte - -#integ-test-quartz: -# <<: *quartz_template -# stage: test-integ -# tags: -# - quartz -# - batch -# variables: *quartz_batch_variables -# script: -# - cd t/ci && prove -v RUN_CI_TESTS.sh -# dependencies: -# - build-quartz - -integ-test-butte: - <<: *butte_template - stage: test-integ - tags: - - butte - - batch - variables: *butte_batch_variables - script: - - cd t/ci && prove -v RUN_CI_TESTS.sh - dependencies: - - build-butte + - which spack || ((cd $HOME/spack && git describe) && . $HOME/spack/share/spack/setup-env.sh) + - if [[ -d $WORKING_DIR ]]; then cd ${WORKING_DIR}; else export WORKING_DIR=${CI_PROJECT_DIR}; fi + - module load $COMPILER + - CC_PATH=$($CC_COMMAND) + - FC_PATH=$($FC_COMMAND) + - SPACK_COMPILER=${COMPILER//\//@} + - SPACK_ARCH="$(spack arch -p)-$(spack arch -o)-$(uname -m)" + - spack load gotcha %$SPACK_COMPILER arch=$SPACK_ARCH + - spack load argobots %$SPACK_COMPILER arch=$SPACK_ARCH + - spack load mercury %$SPACK_COMPILER arch=$SPACK_ARCH + - spack load margo %$SPACK_COMPILER arch=$SPACK_ARCH + - spack load spath %$SPACK_COMPILER arch=$SPACK_ARCH + +# System specific jobs +include: + - local: .gitlab/ascent.yml + - local: .gitlab/catalyst.yml + - local: .gitlab/lassen.yml diff --git a/.gitlab/ascent.yml b/.gitlab/ascent.yml new file mode 100644 index 000000000..d997f6f9b --- /dev/null +++ b/.gitlab/ascent.yml @@ -0,0 +1,54 @@ +##### Ascent Templates ##### + +# The WORKING_DIR envar is defined to allow the init job to clone the +# git repo to a different location than the default. Subsequent jobs +# will then `cd` to this directory during their before_script stage. +# The WORKING_DIR_BASE envar is definied in the Gitlab UI. +# +# The RUN_ASCENT variable can be toggled in the Gitlab interface to +# toggle whether jobs should be run on this system. 
+.ascent-template: + variables: + WORKING_DIR: ${WORKING_DIR_BASE}/${CI_PIPELINE_ID}/source + extends: .base-template + rules: + - if: '$RUN_ASCENT != "ON"' + when: never + - when: on_success + +.ascent-shell-template: + extends: .ascent-template + tags: [nobatch] + +.ascent-batch-template: + extends: .ascent-template + tags: [batch] + +##### All Ascent Jobs ##### + +ascent-gcc-4_8_5-init: + extends: [.ascent-shell-template, .init-template] + +ascent-gcc-4_8_5-build: + variables: + COMPILER: gcc/4.8.5 + CC_COMMAND: "which gcc" + FC_COMMAND: "which gfortran" + extends: [.ascent-shell-template, .build-template] + needs: ["ascent-gcc-4_8_5-init"] + +ascent-gcc-4_8_5-unit-test: + variables: + COMPILER: gcc/4.8.5 + CC_COMMAND: "which gcc" + FC_COMMAND: "which gfortran" + extends: [.lsf-single-node-template, .ascent-batch-template, .unit-test-template] + needs: ["ascent-gcc-4_8_5-build"] + +ascent-gcc-4_8_5-integ-test: + variables: + COMPILER: gcc/4.8.5 + CC_COMMAND: "which gcc" + FC_COMMAND: "which gfortran" + extends: [.lsf-multi-node-template, .ascent-batch-template, .integ-test-template] + needs: ["ascent-gcc-4_8_5-build"] diff --git a/.gitlab/catalyst.yml b/.gitlab/catalyst.yml new file mode 100644 index 000000000..f3533548c --- /dev/null +++ b/.gitlab/catalyst.yml @@ -0,0 +1,47 @@ +# Catalyst Templates + +# The RUN_CATALYST variable can be toggled in the Gitlab interface to +# toggle whether jobs should be run on this system. +.catalyst-template: + extends: .base-template + rules: + - if: '$RUN_CATALYST != "ON"' + when: never + - when: on_success + +.catalyst-shell-template: + extends: .catalyst-template + tags: + - catalyst + - shell + +.catalyst-batch-template: + extends: .catalyst-template + tags: + - catalyst + - batch + +##### All Catalyst Jobs ##### + +catalyst-gcc-4_9_3-build: + variables: + COMPILER: gcc/4.9.3 + CC_COMMAND: "which gcc" + FC_COMMAND: "which gfortran" + extends: [.catalyst-shell-template, .build-template] + +catalyst-gcc-4_9_3-unit-test: + variables: + COMPILER: gcc/4.9.3 + CC_COMMAND: "which gcc" + FC_COMMAND: "which gfortran" + extends: [.slurm-single-node-template, .catalyst-batch-template, .unit-test-template] + needs: ["catalyst-gcc-4_9_3-build"] + +catalyst-gcc-4_9_3-integ-test: + variables: + COMPILER: gcc/4.9.3 + CC_COMMAND: "which gcc" + FC_COMMAND: "which gfortran" + extends: [.slurm-multi-node-template, .catalyst-batch-template, .integ-test-template] + needs: ["catalyst-gcc-4_9_3-build"] diff --git a/.gitlab/lassen.yml b/.gitlab/lassen.yml new file mode 100644 index 000000000..a34a6f587 --- /dev/null +++ b/.gitlab/lassen.yml @@ -0,0 +1,47 @@ +##### Lassen Templates ##### + +# The RUN_LASSEN variable can be toggled in the Gitlab interface to +# toggle whether jobs should be run on this system. 
+.lassen-template: + extends: .base-template + rules: + - if: '$RUN_LASSEN != "ON"' + when: never + - when: on_success + +.lassen-shell-template: + extends: .lassen-template + tags: + - lassen + - shell + +.lassen-batch-template: + extends: .lassen-template + tags: + - lassen + - batch + +##### All Lassen Jobs ##### + +lassen-gcc-4_9_3-build: + variables: + COMPILER: gcc/4.9.3 + CC_COMMAND: "which gcc" + FC_COMMAND: "which gfortran" + extends: [.lassen-shell-template, .build-template] + +lassen-gcc-4_9_3-unit-test: + variables: + COMPILER: gcc/4.9.3 + CC_COMMAND: "which gcc" + FC_COMMAND: "which gfortran" + extends: [.lsf-single-node-template, .lassen-batch-template, .unit-test-template] + needs: ["lassen-gcc-4_9_3-build"] + +lassen-gcc-4_9_3-integ-test: + variables: + COMPILER: gcc/4.9.3 + CC_COMMAND: "which gcc" + FC_COMMAND: "which gfortran" + extends: [.lsf-multi-node-template, .lassen-batch-template, .integ-test-template] + needs: ["lassen-gcc-4_9_3-build"] diff --git a/.travis.yml b/.travis.yml index f9de90d0c..5849866d0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,12 +1,13 @@ language: c -sudo: required -dist: xenial +dist: bionic +os: linux addons: apt: update: true packages: - autoconf + - autoconf-archive - automake - build-essential - cmake @@ -26,38 +27,48 @@ before_install: - | test -f $HOME/spack/etc/spack/packages.yaml || cat > $HOME/spack/etc/spack/packages.yaml << ' EOF' packages: + all: + target: [x86_64] + providers: + mpi: [openmpi] autoconf: buildable: False - paths: - autoconf@2.69: /usr + externals: + - spec: "autoconf@2.69" + prefix: /usr automake: buildable: False - paths: - automake@1.15: /usr + externals: + - spec: "automake@1.15.1" + prefix: /usr cmake: buildable: False - paths: - cmake@3.12.4: /usr/local/cmake-3.12.4 + externals: + - spec: "cmake@3.12.4" + prefix: /usr/local/cmake-3.12.4 libtool: buildable: False - paths: - libtool@2.4.6: /usr + externals: + - spec: "libtool@2.4.6" + prefix: /usr m4: buildable: False - paths: - m4@4.17: /usr + externals: + - spec: "m4@1.4.18" + prefix: /usr + openmpi: + buildable: False + externals: + - spec: "openmpi@2.1.1" + prefix: /usr EOF install: - - $HOME/spack/bin/spack install environment-modules - . 
$HOME/spack/share/spack/setup-env.sh - - spack install leveldb - - spack install gotcha@0.0.2 - - spack install flatcc - - spack install margo^mercury+bmi~boostsys + - spack install gotcha@1.0.3 && spack load gotcha@1.0.3 + - spack install margo^mercury@1.0.1+bmi~ofi~boostsys && spack load argobots && spack load mercury && spack load margo + - spack install spath && spack load spath # prepare build environment - - spack load environment-modules - - source <(spack module tcl loads leveldb gotcha@0.0.2 flatcc mercury argobots margo) - eval $(./scripts/git_log_test_env.sh) - export TEST_CHECKPATCH_SKIP_FILES @@ -72,9 +83,10 @@ before_cache: script: # Force git to update the shallow clone and include tags so git-describe works - git fetch --unshallow --tags - - sh autogen.sh - - ./configure --enable-fortran || cat config.log - - make -k && make distcheck + - export DISTCHECK_CONFIGURE_FLAGS="--enable-fortran" + - ./autogen.sh + - ./configure $DISTCHECK_CONFIGURE_FLAGS || cat config.log + - make distcheck - ./scripts/checkpatch.sh || test "$TEST_CHECKPATCH_ALLOW_FAILURE" = yes after_failure: diff --git a/Makefile.am b/Makefile.am index 76addc56e..5c402c5d0 100644 --- a/Makefile.am +++ b/Makefile.am @@ -2,7 +2,7 @@ SUBDIRS = common meta server client examples extras t util CONFIG = ordered -#ACLOCAL_AMFLAGS = -I m4 +ACLOCAL_AMFLAGS = -I m4 pkgconfigdir = @pkgconfigdir@ pkgconfig_DATA = client/unifyfs.pc diff --git a/bootstrap.sh b/bootstrap.sh index ff66a2e99..81f1884b7 100755 --- a/bootstrap.sh +++ b/bootstrap.sh @@ -12,14 +12,16 @@ INSTALL_DIR=$ROOT/install cd deps repos=( https://xgitlab.cels.anl.gov/sds/bmi.git - https://github.com/google/leveldb.git https://github.com/LLNL/GOTCHA.git https://github.com/pmodels/argobots.git https://github.com/mercury-hpc/mercury.git https://xgitlab.cels.anl.gov/sds/margo.git - https://github.com/dvidelabs/flatcc.git ) +if [ $1 = "--with-leveldb" ]; then + repos+=(https://github.com/google/leveldb.git) +fi + for i in "${repos[@]}" ; do # Get just the name of the project (like "mercury") name=$(basename $i | sed 's/\.git//g') @@ -41,20 +43,24 @@ cd bmi make -j $(nproc) && make install cd .. -echo "### building leveldb ###" -cd leveldb -git checkout 1.22 -mkdir -p build && cd build -cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \ - -DBUILD_SHARED_LIBS=yes .. -make -j $(nproc) && make install -cd .. -cd .. +if [ $1 = "--with-leveldb" ]; then + echo "### building leveldb ###" + cd leveldb + git checkout 1.22 + mkdir -p build && cd build + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \ + -DBUILD_SHARED_LIBS=yes .. + make -j $(nproc) && make install + cd .. + cd .. +else + echo "### skipping leveldb build ###" +fi echo "### building GOTCHA ###" cd GOTCHA # Unify won't build against latest GOTCHA, so use a known compatible version. -git checkout 0.0.2 +git checkout 1.0.3 mkdir -p build && cd build cmake -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" .. make -j $(nproc) && make install @@ -63,21 +69,25 @@ cd .. echo "### building argobots ###" cd argobots +git checkout v1.0 ./autogen.sh && CC=gcc ./configure --prefix="$INSTALL_DIR" make -j $(nproc) && make install cd .. 
echo "### building mercury ###" cd mercury +git checkout v1.0.1 +git submodule update --init mkdir -p build && cd build cmake -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \ + -DMERCURY_USE_SELF_FORWARD=ON \ -DMERCURY_USE_BOOST_PP=ON \ -DMERCURY_USE_CHECKSUMS=ON \ -DMERCURY_USE_EAGER_BULK=ON \ -DMERCURY_USE_SYSTEM_MCHECKSUM=OFF \ -DNA_USE_BMI=ON \ -DMERCURY_USE_XDR=OFF \ - -DBUILD_SHARED_LIBS=on .. + -DBUILD_SHARED_LIBS=ON .. make -j $(nproc) && make install cd .. cd .. @@ -87,28 +97,19 @@ cd margo git checkout v0.4.3 export PKG_CONFIG_PATH="$INSTALL_DIR/lib/pkgconfig" ./prepare.sh -./configure --prefix="$INSTALL_DIR" +./configure --prefix="$INSTALL_DIR" --enable-shared make -j $(nproc) && make install cd .. -echo "### building flatcc ###" -cd flatcc -# need -DBUILD_SHARED_LIBS=ye -mkdir -p build && cd build -cmake -DBUILD_SHARED_LIBS=on -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" -DFLATCC_INSTALL=on .. -make -j $(nproc) && make install -cd .. -cd .. - cd "$ROOT" echo "*************************************************************************" -echo "Dependencies are all built. You can now build Unify with:" +echo "Dependencies are all built. You can now build UnifyFS with:" echo "" -echo " export PKG_CONFIG_PATH=$INSTALL_DIR/lib/pkgconfig" -echo " ./autogen.sh && ./configure --with-leveldb=$INSTALL_DIR" \ - "--with-gotcha=$INSTALL_DIR --with-flatcc=$INSTALL_DIR" +echo -n " export PKG_CONFIG_PATH=$INSTALL_DIR/lib/pkgconfig && " +echo "export LEVELDB_ROOT=$INSTALL_DIR" +echo -n " ./autogen.sh && ./configure --with-gotcha=$INSTALL_DIR" +echo " --prefix=$INSTALL_DIR" echo " make" echo "" echo "*************************************************************************" diff --git a/client/check_fns/unifyfs_list.txt b/client/check_fns/unifyfs_list.txt index 343f8bfc8..83693605a 100644 --- a/client/check_fns/unifyfs_list.txt +++ b/client/check_fns/unifyfs_list.txt @@ -8,6 +8,7 @@ int UNIFYFS_WRAP(truncate)(const char* path, off_t length) int UNIFYFS_WRAP(unlink)(const char *path) int UNIFYFS_WRAP(remove)(const char *path) int UNIFYFS_WRAP(stat)(const char *path, struct stat *buf) +int UNIFYFS_WRAP(fstat)(int fd, struct stat *buf) int UNIFYFS_WRAP(__xstat)(int vers, const char *path, struct stat *buf) int UNIFYFS_WRAP(__lxstat)(int vers, const char *path, struct stat *buf) int UNIFYFS_WRAP(creat)(const char* path, mode_t mode) diff --git a/client/src/Makefile.am b/client/src/Makefile.am index 399eb80fb..ee975cdb0 100644 --- a/client/src/Makefile.am +++ b/client/src/Makefile.am @@ -1,13 +1,16 @@ -lib_LTLIBRARIES = libunifyfs.la libunifyfs_gotcha.la +lib_LTLIBRARIES = libunifyfs.la +libunifyfsdir = $(includedir) + +if HAVE_GOTCHA +lib_LTLIBRARIES += libunifyfs_gotcha.la +libunifyfs_gotchadir = $(includedir) +endif if HAVE_FORTRAN lib_LTLIBRARIES += libunifyfsf.la endif -libunifyfsdir = $(includedir) -libunifyfs_gotchadir = $(includedir) - -AM_CFLAGS = -Wall -Wno-strict-aliasing +AM_CFLAGS = -Wall -Wno-strict-aliasing -Werror include_HEADERS = unifyfs.h @@ -16,27 +19,26 @@ include_HEADERS += unifyfsf.h endif CLIENT_COMMON_CPPFLAGS = \ + -Wall -Werror \ -I$(top_builddir)/client \ -I$(top_srcdir)/common/src CLIENT_COMMON_CFLAGS = \ + -Wall -Werror \ $(MPI_CFLAGS) \ $(MERCURY_CFLAGS) \ $(ARGOBOTS_CFLAGS) \ - $(MARGO_CFLAGS) \ - $(FLATCC_CFLAGS) + $(MARGO_CFLAGS) CLIENT_COMMON_LDFLAGS = \ -version-info $(LIBUNIFYFS_LT_VERSION) \ $(MPI_CLDFLAGS) \ - $(MARGO_LDFLAGS) \ - $(FLATCC_LDFLAGS) + $(MARGO_LDFLAGS) CLIENT_COMMON_LIBADD = \ $(top_builddir)/common/src/libunifyfs_common.la \ 
$(MARGO_LIBS) \ - $(FLATCC_LIBS) \ - -lcrypto -lrt -lpthread + -lrt -lpthread CLIENT_COMMON_SOURCES = \ margo_client.c \ @@ -48,8 +50,6 @@ CLIENT_COMMON_SOURCES = \ unifyfs-fixed.c \ unifyfs-fixed.h \ unifyfs-internal.h \ - unifyfs-stack.c \ - unifyfs-stack.h \ unifyfs-stdio.c \ unifyfs-stdio.h \ unifyfs-sysio.c \ @@ -69,12 +69,16 @@ libunifyfs_la_CFLAGS = $(CLIENT_COMMON_CFLAGS) libunifyfs_la_LDFLAGS = $(CLIENT_COMMON_LDFLAGS) libunifyfs_la_LIBADD = $(CLIENT_COMMON_LIBADD) -libunifyfs_gotcha_la_SOURCES = $(CLIENT_COMMON_SOURCES) gotcha_map_unifyfs_list.h +if HAVE_GOTCHA + +libunifyfs_gotcha_la_SOURCES = $(CLIENT_COMMON_SOURCES) gotcha_map_unifyfs_list.c libunifyfs_gotcha_la_CPPFLAGS = $(CLIENT_COMMON_CPPFLAGS) -DUNIFYFS_GOTCHA libunifyfs_gotcha_la_CFLAGS = $(CLIENT_COMMON_CFLAGS) $(GOTCHA_CFLAGS) libunifyfs_gotcha_la_LDFLAGS = $(CLIENT_COMMON_LDFLAGS) $(GOTCHA_LDFLAGS) libunifyfs_gotcha_la_LIBADD = $(CLIENT_COMMON_LIBADD) -lgotcha +endif + if HAVE_FORTRAN libunifyfsf_la_SOURCES = unifyfsf.c diff --git a/client/src/gotcha_map_unifyfs_list.c b/client/src/gotcha_map_unifyfs_list.c new file mode 100644 index 000000000..01e503f05 --- /dev/null +++ b/client/src/gotcha_map_unifyfs_list.c @@ -0,0 +1,466 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#include "unifyfs-internal.h" +#include + +/* define gotcha-specific state to use with our wrapper */ +#define UNIFYFS_DEF(name, ret, args, argnames) \ +gotcha_wrappee_handle_t wrappee_handle_ ## name; \ +ret (*__real_ ## name) args = NULL; + +UNIFYFS_DEF(access, int, + (const char* path, int mode), + (path, mode)) +UNIFYFS_DEF(chmod, int, + (const char* path, mode_t mode), + (path, mode)) +UNIFYFS_DEF(mkdir, int, + (const char* path, mode_t mode), + (path, mode)) +UNIFYFS_DEF(rmdir, int, + (const char* path), + (path)) +UNIFYFS_DEF(chdir, int, + (const char* path), + (path)) +UNIFYFS_DEF(__getcwd_chk, char*, + (char* path, size_t size, size_t buflen), + (path, size, buflen)) +UNIFYFS_DEF(getcwd, char*, + (char* path, size_t size), + (path, size)) +UNIFYFS_DEF(getwd, char*, + (char* path), + (path)) +UNIFYFS_DEF(get_current_dir_name, char*, + (void), + ()) +UNIFYFS_DEF(rename, int, + (const char* oldpath, const char* newpath), + (oldpath, newpath)) +UNIFYFS_DEF(truncate, int, + (const char* path, off_t length), + (path, length)) +UNIFYFS_DEF(unlink, int, + (const char* path), + (path)) +UNIFYFS_DEF(remove, int, + (const char* path), + (path)) + +UNIFYFS_DEF(stat, int, + (const char* path, struct stat* buf), + (path, buf)) +UNIFYFS_DEF(fstat, int, + (int fd, struct stat* buf), + (fd, buf)) +UNIFYFS_DEF(__xstat, int, + (int vers, const char* path, struct stat* buf), + (vers, path, buf)) +UNIFYFS_DEF(__fxstat, int, + (int vers, int fd, struct stat* buf), + (vers, fd, buf)) +UNIFYFS_DEF(__lxstat, int, + (int vers, const char* path, struct stat* buf), + (vers, path, buf)) +UNIFYFS_DEF(statfs, int, + (const char* path, struct statfs* fsbuf), + (path, fsbuf)) +UNIFYFS_DEF(fstatfs, int, + (int fd, struct statfs* fsbuf), + (fd, fsbuf)) + +UNIFYFS_DEF(creat, int, + (const char* path, mode_t mode), + (path, mode)) +UNIFYFS_DEF(creat64, int, + (const char* path, mode_t mode), + (path, mode)) 
+UNIFYFS_DEF(open, int, + (const char* path, int flags, ...), + (path, flags)) +UNIFYFS_DEF(open64, int, + (const char* path, int flags, ...), + (path, flags)) +UNIFYFS_DEF(__open_2, int, + (const char* path, int flags, ...), + (path, flags)) + +#ifdef HAVE_LIO_LISTIO +UNIFYFS_DEF(lio_listio, int, + (int m, struct aiocb* const cblist[], int n, struct sigevent* sep), + (m, cblist, n, sep)) +#endif + +UNIFYFS_DEF(lseek, off_t, + (int fd, off_t offset, int whence), + (fd, offset, whence)) +UNIFYFS_DEF(lseek64, off64_t, + (int fd, off64_t offset, int whence), + (fd, offset, whence)) + +UNIFYFS_DEF(posix_fadvise, int, + (int fd, off_t offset, off_t len, int advice), + (fd, offset, len, advice)) + +UNIFYFS_DEF(read, ssize_t, + (int fd, void* buf, size_t count), + (fd, buf, count)) +UNIFYFS_DEF(write, ssize_t, + (int fd, const void* buf, size_t count), + (fd, buf, count)) +UNIFYFS_DEF(readv, ssize_t, + (int fd, const struct iovec* iov, int iovcnt), + (fd, iov, iovcnt)) +UNIFYFS_DEF(writev, ssize_t, + (int fd, const struct iovec* iov, int iovcnt), + (fd, iov, iovcnt)) +UNIFYFS_DEF(pread, ssize_t, + (int fd, void* buf, size_t count, off_t off), + (fd, buf, count, off)) +UNIFYFS_DEF(pread64, ssize_t, + (int fd, void* buf, size_t count, off64_t off), + (fd, buf, count, off)) +UNIFYFS_DEF(pwrite, ssize_t, + (int fd, const void* buf, size_t count, off_t off), + (fd, buf, count, off)) +UNIFYFS_DEF(pwrite64, ssize_t, + (int fd, const void* buf, size_t count, off64_t off), + (fd, buf, count, off)) +UNIFYFS_DEF(close, int, + (int fd), + (fd)) +UNIFYFS_DEF(fchdir, int, + (int fd), + (fd)) +UNIFYFS_DEF(ftruncate, int, + (int fd, off_t length), + (fd, length)) +UNIFYFS_DEF(fsync, int, + (int fd), + (fd)) +UNIFYFS_DEF(fdatasync, int, + (int fd), + (fd)) +UNIFYFS_DEF(flock, int, + (int fd, int operation), + (fd, operation)) +UNIFYFS_DEF(fchmod, int, + (int fd, mode_t mode), + (fd, mode)) + +UNIFYFS_DEF(mmap, void*, + (void* addr, size_t len, int prot, int fl, int fd, off_t off), + (addr, len, prot, fl, fd, off)) +UNIFYFS_DEF(msync, int, + (void* addr, size_t length, int flags), + (addr, length, flags)) +UNIFYFS_DEF(munmap, int, + (void* addr, size_t length), + (addr, length)) +UNIFYFS_DEF(mmap64, void*, + (void* addr, size_t len, int prot, int fl, int fd, off_t off), + (addr, len, prot, fl, fd, off)) + +UNIFYFS_DEF(opendir, DIR*, + (const char* name), + (name)) +UNIFYFS_DEF(fdopendir, DIR*, + (int fd), + (fd)) +UNIFYFS_DEF(closedir, int, + (DIR* dirp), + (dirp)) +UNIFYFS_DEF(readdir, struct dirent*, + (DIR* dirp), + (dirp)) +UNIFYFS_DEF(rewinddir, void, + (DIR* dirp), + (dirp)) +UNIFYFS_DEF(dirfd, int, + (DIR* dirp), + (dirp)) +UNIFYFS_DEF(telldir, long, + (DIR* dirp), + (dirp)) +UNIFYFS_DEF(scandir, int, + (const char* dirp, struct dirent** namelist, + int (*filter)(const struct dirent*), + int (*compar)(const struct dirent**, const struct dirent**)), + (dirp, namelist, filter, compar)) +UNIFYFS_DEF(seekdir, void, + (DIR* dirp, long loc), + (dirp, loc)) + +UNIFYFS_DEF(fopen, FILE*, + (const char* path, const char* mode), + (path, mode)) +UNIFYFS_DEF(freopen, FILE*, + (const char* path, const char* mode, FILE* stream), + (path, mode, stream)) +UNIFYFS_DEF(setvbuf, int, + (FILE* stream, char* buf, int type, size_t size), + (stream, buf, type, size)) +UNIFYFS_DEF(setbuf, void, + (FILE* stream, char* buf), + (stream, buf)) +UNIFYFS_DEF(ungetc, int, + (int c, FILE* stream), + (c, stream)) +UNIFYFS_DEF(fgetc, int, + (FILE* stream), + (stream)) +UNIFYFS_DEF(fputc, int, + (int c, FILE* stream), + (c, stream)) 
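
[Editorial sketch, not part of this patch] For readers new to Gotcha's handle-based API, here is what one UNIFYFS_DEF invocation from the list above generates and how the saved wrappee is reached afterwards. The expansion follows the macro definition at the top of gotcha_map_unifyfs_list.c; the gotcha/gotcha.h include path and the small call_real_close() helper are illustrative assumptions only.

    /* UNIFYFS_DEF(close, int, (int fd), (fd)) expands to the two
     * per-function symbols that the binding table below relies on:
     *
     *   gotcha_wrappee_handle_t wrappee_handle_close;
     *   int (*__real_close)(int fd) = NULL;
     *
     * Once gotcha_wrap() has filled in the handle, the original libc
     * entry point can be recovered with gotcha_get_wrappee(), exactly
     * as setup_gotcha_wrappers() does when reporting unwrapped
     * functions: */
    #include <gotcha/gotcha.h>

    extern gotcha_wrappee_handle_t wrappee_handle_close;

    static int call_real_close(int fd)
    {
        /* resolve the real close() through the handle gotcha filled in */
        int (*real_close)(int) =
            (int (*)(int)) gotcha_get_wrappee(wrappee_handle_close);
        return real_close(fd);
    }
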
+UNIFYFS_DEF(getc, int, + (FILE* stream), + (stream)) +UNIFYFS_DEF(putc, int, + (int c, FILE* stream), + (c, stream)) +UNIFYFS_DEF(fgets, char*, + (char* s, int n, FILE* stream), + (s, n, stream)) +UNIFYFS_DEF(fputs, int, + (const char* s, FILE* stream), + (s, stream)) +UNIFYFS_DEF(fread, size_t, + (void* ptr, size_t size, size_t nitems, FILE* stream), + (ptr, size, nitems, stream)) +UNIFYFS_DEF(fwrite, size_t, + (const void* ptr, size_t size, size_t nitems, FILE* stream), + (ptr, size, nitems, stream)) +UNIFYFS_DEF(fprintf, int, + (FILE* stream, const char* format, ...), + (stream, format)) +UNIFYFS_DEF(fscanf, int, + (FILE* stream, const char* format, ...), + (stream, format)) +UNIFYFS_DEF(vfprintf, int, + (FILE* stream, const char* format, va_list args), + (stream, format, args)) +UNIFYFS_DEF(vfscanf, int, + (FILE* stream, const char* format, va_list args), + (stream, format, args)) +UNIFYFS_DEF(fseek, int, + (FILE* stream, long offset, int whence), + (stream, offset, whence)) +UNIFYFS_DEF(fseeko, int, + (FILE* stream, off_t offset, int whence), + (stream, offset, whence)) +UNIFYFS_DEF(ftell, long, + (FILE* stream), + (stream)) +UNIFYFS_DEF(ftello, off_t, + (FILE* stream), + (stream)) +UNIFYFS_DEF(rewind, void, + (FILE* stream), + (stream)) +UNIFYFS_DEF(fgetpos, int, + (FILE* stream, fpos_t* pos), + (stream, pos)) +UNIFYFS_DEF(fsetpos, int, + (FILE* stream, const fpos_t* pos), + (stream, pos)) +UNIFYFS_DEF(fflush, int, + (FILE* stream), + (stream)) +UNIFYFS_DEF(feof, int, + (FILE* stream), + (stream)) +UNIFYFS_DEF(ferror, int, + (FILE* stream), + (stream)) +UNIFYFS_DEF(clearerr, void, + (FILE* stream), + (stream)) +UNIFYFS_DEF(fileno, int, + (FILE* stream), + (stream)) +UNIFYFS_DEF(fclose, int, + (FILE* stream), + (stream)) +UNIFYFS_DEF(fwprintf, int, + (FILE* stream, const wchar_t* format, ...), + (stream, format)) +UNIFYFS_DEF(fwscanf, int, + (FILE* stream, const wchar_t* format, ...), + (stream, format)) +UNIFYFS_DEF(vfwprintf, int, + (FILE* stream, const wchar_t* format, va_list args), + (stream, format, args)) +UNIFYFS_DEF(vfwscanf, int, + (FILE* stream, const wchar_t* format, va_list args), + (stream, format, args)) +UNIFYFS_DEF(fgetwc, wint_t, + (FILE* stream), + (stream)) +UNIFYFS_DEF(fgetws, wchar_t*, + (wchar_t* s, int n, FILE* stream), + (s, n, stream)) +UNIFYFS_DEF(fputwc, wint_t, + (wchar_t wc, FILE* stream), + (wc, stream)) +UNIFYFS_DEF(fputws, int, + (const wchar_t* s, FILE* stream), + (s, stream)) +UNIFYFS_DEF(fwide, int, + (FILE* stream, int mode), + (stream, mode)) +UNIFYFS_DEF(getwc, wint_t, + (FILE* stream), + (stream)) +UNIFYFS_DEF(putwc, wint_t, + (wchar_t c, FILE* stream), + (c, stream)) +UNIFYFS_DEF(ungetwc, wint_t, + (wint_t c, FILE* stream), + (c, stream)) + +struct gotcha_binding_t unifyfs_wrappers[] = { + { "access", UNIFYFS_WRAP(access), &wrappee_handle_access }, + { "chmod", UNIFYFS_WRAP(chmod), &wrappee_handle_chmod }, + { "mkdir", UNIFYFS_WRAP(mkdir), &wrappee_handle_mkdir }, + { "rmdir", UNIFYFS_WRAP(rmdir), &wrappee_handle_rmdir }, + { "chdir", UNIFYFS_WRAP(chdir), &wrappee_handle_chdir }, + { "__getcwd_chk", UNIFYFS_WRAP(__getcwd_chk), + &wrappee_handle___getcwd_chk }, + { "getcwd", UNIFYFS_WRAP(getcwd), &wrappee_handle_getcwd }, + { "getwd", UNIFYFS_WRAP(getwd), &wrappee_handle_getwd }, + { "get_current_dir_name", UNIFYFS_WRAP(get_current_dir_name), + &wrappee_handle_get_current_dir_name }, + { "rename", UNIFYFS_WRAP(rename), &wrappee_handle_rename }, + { "truncate", UNIFYFS_WRAP(truncate), &wrappee_handle_truncate }, + { "unlink", 
UNIFYFS_WRAP(unlink), &wrappee_handle_unlink }, + { "remove", UNIFYFS_WRAP(remove), &wrappee_handle_remove }, + { "stat", UNIFYFS_WRAP(stat), &wrappee_handle_stat }, + { "fstat", UNIFYFS_WRAP(fstat), &wrappee_handle_fstat }, + { "__xstat", UNIFYFS_WRAP(__xstat), &wrappee_handle___xstat }, + { "__fxstat", UNIFYFS_WRAP(__fxstat), &wrappee_handle___fxstat }, + { "__lxstat", UNIFYFS_WRAP(__lxstat), &wrappee_handle___lxstat }, + { "statfs", UNIFYFS_WRAP(statfs), &wrappee_handle_statfs }, + { "fstatfs", UNIFYFS_WRAP(fstatfs), &wrappee_handle_fstatfs }, + { "creat", UNIFYFS_WRAP(creat), &wrappee_handle_creat }, + { "creat64", UNIFYFS_WRAP(creat64), &wrappee_handle_creat64 }, + { "open", UNIFYFS_WRAP(open), &wrappee_handle_open }, + { "open64", UNIFYFS_WRAP(open64), &wrappee_handle_open64 }, + { "__open_2", UNIFYFS_WRAP(__open_2), &wrappee_handle___open_2 }, +#ifdef HAVE_LIO_LISTIO + { "lio_listio", UNIFYFS_WRAP(lio_listio), &wrappee_handle_lio_listio }, +#endif + { "lseek", UNIFYFS_WRAP(lseek), &wrappee_handle_lseek }, + { "lseek64", UNIFYFS_WRAP(lseek64), &wrappee_handle_lseek64 }, + { "posix_fadvise", UNIFYFS_WRAP(posix_fadvise), &wrappee_handle_posix_fadvise }, + { "read", UNIFYFS_WRAP(read), &wrappee_handle_read }, + { "write", UNIFYFS_WRAP(write), &wrappee_handle_write }, + { "readv", UNIFYFS_WRAP(readv), &wrappee_handle_readv }, + { "writev", UNIFYFS_WRAP(writev), &wrappee_handle_writev }, + { "pread", UNIFYFS_WRAP(pread), &wrappee_handle_pread }, + { "pread64", UNIFYFS_WRAP(pread64), &wrappee_handle_pread64 }, + { "pwrite", UNIFYFS_WRAP(pwrite), &wrappee_handle_pwrite }, + { "pwrite64", UNIFYFS_WRAP(pwrite64), &wrappee_handle_pwrite64 }, + { "fchdir", UNIFYFS_WRAP(fchdir), &wrappee_handle_fchdir }, + { "ftruncate", UNIFYFS_WRAP(ftruncate), &wrappee_handle_ftruncate }, + { "fsync", UNIFYFS_WRAP(fsync), &wrappee_handle_fsync }, + { "fdatasync", UNIFYFS_WRAP(fdatasync), &wrappee_handle_fdatasync }, + { "flock", UNIFYFS_WRAP(flock), &wrappee_handle_flock }, + { "fchmod", UNIFYFS_WRAP(fchmod), &wrappee_handle_fchmod }, + { "mmap", UNIFYFS_WRAP(mmap), &wrappee_handle_mmap }, + { "msync", UNIFYFS_WRAP(msync), &wrappee_handle_msync }, + { "munmap", UNIFYFS_WRAP(munmap), &wrappee_handle_munmap }, + { "mmap64", UNIFYFS_WRAP(mmap64), &wrappee_handle_mmap64 }, + { "close", UNIFYFS_WRAP(close), &wrappee_handle_close }, + { "opendir", UNIFYFS_WRAP(opendir), &wrappee_handle_opendir }, + { "fdopendir", UNIFYFS_WRAP(fdopendir), &wrappee_handle_fdopendir }, + { "closedir", UNIFYFS_WRAP(closedir), &wrappee_handle_closedir }, + { "readdir", UNIFYFS_WRAP(readdir), &wrappee_handle_readdir }, + { "rewinddir", UNIFYFS_WRAP(rewinddir), &wrappee_handle_rewinddir }, + { "dirfd", UNIFYFS_WRAP(dirfd), &wrappee_handle_dirfd }, + { "telldir", UNIFYFS_WRAP(telldir), &wrappee_handle_telldir }, + { "scandir", UNIFYFS_WRAP(scandir), &wrappee_handle_scandir }, + { "seekdir", UNIFYFS_WRAP(seekdir), &wrappee_handle_seekdir }, + { "fopen", UNIFYFS_WRAP(fopen), &wrappee_handle_fopen }, + { "freopen", UNIFYFS_WRAP(freopen), &wrappee_handle_freopen }, + { "setvbuf", UNIFYFS_WRAP(setvbuf), &wrappee_handle_setvbuf }, + { "setbuf", UNIFYFS_WRAP(setbuf), &wrappee_handle_setbuf }, + { "ungetc", UNIFYFS_WRAP(ungetc), &wrappee_handle_ungetc }, + { "fgetc", UNIFYFS_WRAP(fgetc), &wrappee_handle_fgetc }, + { "fputc", UNIFYFS_WRAP(fputc), &wrappee_handle_fputc }, + { "getc", UNIFYFS_WRAP(getc), &wrappee_handle_getc }, + { "putc", UNIFYFS_WRAP(putc), &wrappee_handle_putc }, + { "fgets", UNIFYFS_WRAP(fgets), &wrappee_handle_fgets }, + { 
"fputs", UNIFYFS_WRAP(fputs), &wrappee_handle_fputs }, + { "fread", UNIFYFS_WRAP(fread), &wrappee_handle_fread }, + { "fwrite", UNIFYFS_WRAP(fwrite), &wrappee_handle_fwrite }, + { "fprintf", UNIFYFS_WRAP(fprintf), &wrappee_handle_fprintf }, + { "vfprintf", UNIFYFS_WRAP(vfprintf), &wrappee_handle_vfprintf }, + { "fscanf", UNIFYFS_WRAP(fscanf), &wrappee_handle_fscanf }, + { "vfscanf", UNIFYFS_WRAP(vfscanf), &wrappee_handle_vfscanf }, + { "fseek", UNIFYFS_WRAP(fseek), &wrappee_handle_fseek }, + { "fseeko", UNIFYFS_WRAP(fseeko), &wrappee_handle_fseeko }, + { "ftell", UNIFYFS_WRAP(ftell), &wrappee_handle_ftell }, + { "ftello", UNIFYFS_WRAP(ftello), &wrappee_handle_ftello }, + { "rewind", UNIFYFS_WRAP(rewind), &wrappee_handle_rewind }, + { "fgetpos", UNIFYFS_WRAP(fgetpos), &wrappee_handle_fgetpos }, + { "fsetpos", UNIFYFS_WRAP(fsetpos), &wrappee_handle_fsetpos }, + { "fflush", UNIFYFS_WRAP(fflush), &wrappee_handle_fflush }, + { "feof", UNIFYFS_WRAP(feof), &wrappee_handle_feof }, + { "ferror", UNIFYFS_WRAP(ferror), &wrappee_handle_ferror }, + { "clearerr", UNIFYFS_WRAP(clearerr), &wrappee_handle_clearerr }, + { "fileno", UNIFYFS_WRAP(fileno), &wrappee_handle_fileno }, + { "fclose", UNIFYFS_WRAP(fclose), &wrappee_handle_fclose }, + { "fwprintf", UNIFYFS_WRAP(fwprintf), &wrappee_handle_fwprintf }, + { "fwscanf", UNIFYFS_WRAP(fwscanf), &wrappee_handle_fwscanf }, + { "vfwprintf", UNIFYFS_WRAP(vfwprintf), &wrappee_handle_vfwprintf }, + { "vfwscanf", UNIFYFS_WRAP(vfwscanf), &wrappee_handle_vfwscanf }, + { "fgetwc", UNIFYFS_WRAP(fgetwc), &wrappee_handle_fgetwc }, + { "fgetws", UNIFYFS_WRAP(fgetws), &wrappee_handle_fgetws }, + { "fputwc", UNIFYFS_WRAP(fputwc), &wrappee_handle_fputwc }, + { "fputws", UNIFYFS_WRAP(fputws), &wrappee_handle_fputws }, + { "fwide", UNIFYFS_WRAP(fwide), &wrappee_handle_fwide }, + { "getwc", UNIFYFS_WRAP(getwc), &wrappee_handle_getwc }, + { "putwc", UNIFYFS_WRAP(putwc), &wrappee_handle_putwc }, + { "ungetwc", UNIFYFS_WRAP(ungetwc), &wrappee_handle_ungetwc }, +}; + +#define GOTCHA_NFUNCS (sizeof(unifyfs_wrappers) / sizeof(gotcha_binding_t)) + +int setup_gotcha_wrappers(void) +{ + /* insert our I/O wrappers using gotcha */ + enum gotcha_error_t result; + result = gotcha_wrap(unifyfs_wrappers, GOTCHA_NFUNCS, "unifyfs"); + if (result != GOTCHA_SUCCESS) { + LOGERR("gotcha_wrap() returned %d", (int) result); + if (result == GOTCHA_FUNCTION_NOT_FOUND) { + /* one or more functions were not found */ + void* fn; + gotcha_wrappee_handle_t* hdlptr; + for (int i = 0; i < GOTCHA_NFUNCS; i++) { + hdlptr = unifyfs_wrappers[i].function_handle; + fn = gotcha_get_wrappee(*hdlptr); + if (NULL == fn) { + LOGWARN("Gotcha failed to wrap function '%s'", + unifyfs_wrappers[i].name); + } + } + } else { + return UNIFYFS_ERROR_GOTCHA; + } + } + return UNIFYFS_SUCCESS; +} diff --git a/client/src/gotcha_map_unifyfs_list.h b/client/src/gotcha_map_unifyfs_list.h deleted file mode 100644 index 412b560f9..000000000 --- a/client/src/gotcha_map_unifyfs_list.h +++ /dev/null @@ -1,202 +0,0 @@ -//Generated by translate.py - -#include -#include - -UNIFYFS_DEF(access, int, (const char* path, int mode)); -UNIFYFS_DEF(chmod, int, (const char* path, mode_t mode)); -UNIFYFS_DEF(fchmod, int, (int fd, mode_t mode)); -UNIFYFS_DEF(mkdir, int, (const char* path, mode_t mode)); -UNIFYFS_DEF(rmdir, int, (const char* path)); -UNIFYFS_DEF(rename, int, (const char* oldpath, const char* newpath)); -UNIFYFS_DEF(truncate, int, (const char* path, off_t length)); -UNIFYFS_DEF(unlink, int, (const char* path)); -UNIFYFS_DEF(remove, int, 
(const char* path)); -UNIFYFS_DEF(stat, int, (const char* path, struct stat* buf)); -UNIFYFS_DEF(__xstat, int, (int vers, const char* path, struct stat* buf)); -UNIFYFS_DEF(__lxstat, int, (int vers, const char* path, struct stat* buf)); -UNIFYFS_DEF(creat, int, (const char* path, mode_t mode)); -UNIFYFS_DEF(creat64, int, (const char* path, mode_t mode)); -UNIFYFS_DEF(open, int, (const char* path, int flags, ...)); -UNIFYFS_DEF(open64, int, (const char* path, int flags, ...)); -UNIFYFS_DEF(__open_2, int, (const char* path, int flags, ...)); -UNIFYFS_DEF(lio_listio, int, (int mode, struct aiocb* const aiocb_list[], - int nitems, struct sigevent* sevp)); -UNIFYFS_DEF(lseek, off_t, (int fd, off_t offset, int whence)); -UNIFYFS_DEF(lseek64, off64_t, (int fd, off64_t offset, int whence)); -UNIFYFS_DEF(posix_fadvise, int, (int fd, off_t offset, off_t len, int advice)); -UNIFYFS_DEF(read, ssize_t, (int fd, void* buf, size_t count)); -UNIFYFS_DEF(write, ssize_t, (int fd, const void* buf, size_t count)); -UNIFYFS_DEF(readv, ssize_t, (int fd, const struct iovec* iov, int iovcnt)); -UNIFYFS_DEF(writev, ssize_t, (int fd, const struct iovec* iov, int iovcnt)); -UNIFYFS_DEF(pread, ssize_t, (int fd, void* buf, size_t count, off_t offset)); -UNIFYFS_DEF(pread64, ssize_t, (int fd, void* buf, size_t count, - off64_t offset)); -UNIFYFS_DEF(pwrite, ssize_t, (int fd, const void* buf, size_t count, - off_t offset)); -UNIFYFS_DEF(pwrite64, ssize_t, (int fd, const void* buf, size_t count, - off64_t offset)); -UNIFYFS_DEF(ftruncate, int, (int fd, off_t length)); -UNIFYFS_DEF(fsync, int, (int fd)); -UNIFYFS_DEF(fdatasync, int, (int fd)); -UNIFYFS_DEF(flock, int, (int fd, int operation)); -UNIFYFS_DEF(mmap, void*, (void* addr, size_t length, int prot, int flags, - int fd, off_t offset)); -UNIFYFS_DEF(msync, int, (void* addr, size_t length, int flags)); -UNIFYFS_DEF(munmap, int, (void* addr, size_t length)); -UNIFYFS_DEF(mmap64, void*, (void* addr, size_t length, int prot, int flags, - int fd, off_t offset)); -UNIFYFS_DEF(__fxstat, int, (int vers, int fd, struct stat* buf)); -UNIFYFS_DEF(close, int, (int fd)); -UNIFYFS_DEF(opendir, DIR*, (const char* name)); -UNIFYFS_DEF(fdopendir, DIR*, (int fd)); -UNIFYFS_DEF(closedir, int, (DIR* dirp)); -UNIFYFS_DEF(readdir, struct dirent*, (DIR* dirp)); -UNIFYFS_DEF(rewinddir, void, (DIR* dirp)); -UNIFYFS_DEF(dirfd, int, (DIR* dirp)); -UNIFYFS_DEF(telldir, long, (DIR* dirp)); -UNIFYFS_DEF(scandir, int, (const char* dirp, struct dirent** namelist, - int (*filter)(const struct dirent*), - int (*compar)(const struct dirent**, - const struct dirent**))); -UNIFYFS_DEF(seekdir, void, (DIR* dirp, long loc)); -UNIFYFS_DEF(fopen, FILE*, (const char* path, const char* mode)); -UNIFYFS_DEF(freopen, FILE*, (const char* path, const char* mode, - FILE* stream)); -UNIFYFS_DEF(setvbuf, int, (FILE* stream, char* buf, int type, size_t size)); -UNIFYFS_DEF(setbuf, void, (FILE* stream, char* buf)); -UNIFYFS_DEF(ungetc, int, (int c, FILE* stream)); -UNIFYFS_DEF(fgetc, int, (FILE* stream)); -UNIFYFS_DEF(fputc, int, (int c, FILE* stream)); -UNIFYFS_DEF(getc, int, (FILE* stream)); -UNIFYFS_DEF(putc, int, (int c, FILE* stream)); -UNIFYFS_DEF(fgets, char*, (char* s, int n, FILE* stream)); -UNIFYFS_DEF(fputs, int, (const char* s, FILE* stream)); -UNIFYFS_DEF(fread, size_t, (void* ptr, size_t size, size_t nitems, - FILE* stream)); -UNIFYFS_DEF(fwrite, size_t, (const void* ptr, size_t size, size_t nitems, - FILE* stream)); -UNIFYFS_DEF(fprintf, int, (FILE* stream, const char* format, ...)); 
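
[Editorial note] Placed side by side, the change in the binding table's third field is the heart of the Gotcha API migration this patch tracks (bootstrap.sh moves from checkout 0.0.2 to 1.0.3). Both entries below are copied verbatim, one from this deleted header and one from the new gotcha_map_unifyfs_list.c above:

    /* old (this deleted header): address of a typed function pointer */
    { "stat", UNIFYFS_WRAP(stat), &UNIFYFS_REAL(stat) },

    /* new (gotcha_map_unifyfs_list.c): address of an opaque handle,
     * later resolved with gotcha_get_wrappee() */
    { "stat", UNIFYFS_WRAP(stat), &wrappee_handle_stat },
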
-UNIFYFS_DEF(vfprintf, int, (FILE* stream, const char* format, va_list ap)); -UNIFYFS_DEF(fscanf, int, (FILE* stream, const char* format, ...)); -UNIFYFS_DEF(vfscanf, int, (FILE* stream, const char* format, va_list ap)); -UNIFYFS_DEF(fseek, int, (FILE* stream, long offset, int whence)); -UNIFYFS_DEF(fseeko, int, (FILE* stream, off_t offset, int whence)); -UNIFYFS_DEF(ftell, long, (FILE* stream)); -UNIFYFS_DEF(ftello, off_t, (FILE* stream)); -UNIFYFS_DEF(rewind, void, (FILE* stream)); -UNIFYFS_DEF(fgetpos, int, (FILE* stream, fpos_t* pos)); -UNIFYFS_DEF(fsetpos, int, (FILE* stream, const fpos_t* pos)); -UNIFYFS_DEF(fflush, int, (FILE* stream)); -UNIFYFS_DEF(feof, int, (FILE* stream)); -UNIFYFS_DEF(ferror, int, (FILE* stream)); -UNIFYFS_DEF(clearerr, void, (FILE* stream)); -UNIFYFS_DEF(fileno, int, (FILE* stream)); -UNIFYFS_DEF(fclose, int, (FILE* stream)); -UNIFYFS_DEF(fwprintf, int, (FILE* stream, const wchar_t* format, ...)); -UNIFYFS_DEF(fwscanf, int, (FILE* stream, const wchar_t* format, ...)); -UNIFYFS_DEF(vfwprintf, int, (FILE* stream, const wchar_t* format, va_list arg)); -UNIFYFS_DEF(vfwscanf, int, (FILE* stream, const wchar_t* format, va_list arg)); -UNIFYFS_DEF(fgetwc, wint_t, (FILE* stream)); -UNIFYFS_DEF(fgetws, wchar_t*, (wchar_t* s, int n, FILE* stream)); -UNIFYFS_DEF(fputwc, wint_t, (wchar_t wc, FILE* stream)); -UNIFYFS_DEF(fputws, int, (const wchar_t* s, FILE* stream)); -UNIFYFS_DEF(fwide, int, (FILE* stream, int mode)); -UNIFYFS_DEF(getwc, wint_t, (FILE* stream)); -UNIFYFS_DEF(putwc, wint_t, (wchar_t c, FILE* stream)); -UNIFYFS_DEF(ungetwc, wint_t, (wint_t c, FILE* stream)); - -struct gotcha_binding_t wrap_unifyfs_list[] = { - { "access", UNIFYFS_WRAP(access), &UNIFYFS_REAL(access) }, - { "chmod", UNIFYFS_WRAP(chmod), &UNIFYFS_REAL(chmod) }, - { "fchmod", UNIFYFS_WRAP(fchmod), &UNIFYFS_REAL(fchmod) }, - { "mkdir", UNIFYFS_WRAP(mkdir), &UNIFYFS_REAL(mkdir) }, - { "rmdir", UNIFYFS_WRAP(rmdir), &UNIFYFS_REAL(rmdir) }, - { "rename", UNIFYFS_WRAP(rename), &UNIFYFS_REAL(rename) }, - { "truncate", UNIFYFS_WRAP(truncate), &UNIFYFS_REAL(truncate) }, - { "unlink", UNIFYFS_WRAP(unlink), &UNIFYFS_REAL(unlink) }, - { "remove", UNIFYFS_WRAP(remove), &UNIFYFS_REAL(remove) }, - { "stat", UNIFYFS_WRAP(stat), &UNIFYFS_REAL(stat) }, - { "__xstat", UNIFYFS_WRAP(__xstat), &UNIFYFS_REAL(__xstat) }, - { "__lxstat", UNIFYFS_WRAP(__lxstat), &UNIFYFS_REAL(__lxstat) }, - { "creat", UNIFYFS_WRAP(creat), &UNIFYFS_REAL(creat) }, - { "creat64", UNIFYFS_WRAP(creat64), &UNIFYFS_REAL(creat64) }, - { "open", UNIFYFS_WRAP(open), &UNIFYFS_REAL(open) }, - { "open64", UNIFYFS_WRAP(open64), &UNIFYFS_REAL(open64) }, - { "__open_2", UNIFYFS_WRAP(__open_2), &UNIFYFS_REAL(__open_2) }, - { "lio_listio", UNIFYFS_WRAP(lio_listio), &UNIFYFS_REAL(lio_listio) }, - { "lseek", UNIFYFS_WRAP(lseek), &UNIFYFS_REAL(lseek) }, - { "lseek64", UNIFYFS_WRAP(lseek64), &UNIFYFS_REAL(lseek64) }, - { "posix_fadvise", UNIFYFS_WRAP(posix_fadvise), &UNIFYFS_REAL(posix_fadvise) }, - { "read", UNIFYFS_WRAP(read), &UNIFYFS_REAL(read) }, - { "write", UNIFYFS_WRAP(write), &UNIFYFS_REAL(write) }, - { "readv", UNIFYFS_WRAP(readv), &UNIFYFS_REAL(readv) }, - { "writev", UNIFYFS_WRAP(writev), &UNIFYFS_REAL(writev) }, - { "pread", UNIFYFS_WRAP(pread), &UNIFYFS_REAL(pread) }, - { "pread64", UNIFYFS_WRAP(pread64), &UNIFYFS_REAL(pread64) }, - { "pwrite", UNIFYFS_WRAP(pwrite), &UNIFYFS_REAL(pwrite) }, - { "pwrite64", UNIFYFS_WRAP(pwrite64), &UNIFYFS_REAL(pwrite64) }, - { "ftruncate", UNIFYFS_WRAP(ftruncate), &UNIFYFS_REAL(ftruncate) }, - { "fsync", 
UNIFYFS_WRAP(fsync), &UNIFYFS_REAL(fsync) }, - { "fdatasync", UNIFYFS_WRAP(fdatasync), &UNIFYFS_REAL(fdatasync) }, - { "flock", UNIFYFS_WRAP(flock), &UNIFYFS_REAL(flock) }, - { "mmap", UNIFYFS_WRAP(mmap), &UNIFYFS_REAL(mmap) }, - { "msync", UNIFYFS_WRAP(msync), &UNIFYFS_REAL(msync) }, - { "munmap", UNIFYFS_WRAP(munmap), &UNIFYFS_REAL(munmap) }, - { "mmap64", UNIFYFS_WRAP(mmap64), &UNIFYFS_REAL(mmap64) }, - { "__fxstat", UNIFYFS_WRAP(__fxstat), &UNIFYFS_REAL(__fxstat) }, - { "close", UNIFYFS_WRAP(close), &UNIFYFS_REAL(close) }, - { "opendir", UNIFYFS_WRAP(opendir), &UNIFYFS_REAL(opendir) }, - { "fdopendir", UNIFYFS_WRAP(fdopendir), &UNIFYFS_REAL(fdopendir) }, - { "closedir", UNIFYFS_WRAP(closedir), &UNIFYFS_REAL(closedir) }, - { "readdir", UNIFYFS_WRAP(readdir), &UNIFYFS_REAL(readdir) }, - { "rewinddir", UNIFYFS_WRAP(rewinddir), &UNIFYFS_REAL(rewinddir) }, - { "dirfd", UNIFYFS_WRAP(dirfd), &UNIFYFS_REAL(dirfd) }, - { "telldir", UNIFYFS_WRAP(telldir), &UNIFYFS_REAL(telldir) }, - { "scandir", UNIFYFS_WRAP(scandir), &UNIFYFS_REAL(scandir) }, - { "seekdir", UNIFYFS_WRAP(seekdir), &UNIFYFS_REAL(seekdir) }, - { "fopen", UNIFYFS_WRAP(fopen), &UNIFYFS_REAL(fopen) }, - { "freopen", UNIFYFS_WRAP(freopen), &UNIFYFS_REAL(freopen) }, - { "setvbuf", UNIFYFS_WRAP(setvbuf), &UNIFYFS_REAL(setvbuf) }, - { "setbuf", UNIFYFS_WRAP(setbuf), &UNIFYFS_REAL(setbuf) }, - { "ungetc", UNIFYFS_WRAP(ungetc), &UNIFYFS_REAL(ungetc) }, - { "fgetc", UNIFYFS_WRAP(fgetc), &UNIFYFS_REAL(fgetc) }, - { "fputc", UNIFYFS_WRAP(fputc), &UNIFYFS_REAL(fputc) }, - { "getc", UNIFYFS_WRAP(getc), &UNIFYFS_REAL(getc) }, - { "putc", UNIFYFS_WRAP(putc), &UNIFYFS_REAL(putc) }, - { "fgets", UNIFYFS_WRAP(fgets), &UNIFYFS_REAL(fgets) }, - { "fputs", UNIFYFS_WRAP(fputs), &UNIFYFS_REAL(fputs) }, - { "fread", UNIFYFS_WRAP(fread), &UNIFYFS_REAL(fread) }, - { "fwrite", UNIFYFS_WRAP(fwrite), &UNIFYFS_REAL(fwrite) }, - { "fprintf", UNIFYFS_WRAP(fprintf), &UNIFYFS_REAL(fprintf) }, - { "vfprintf", UNIFYFS_WRAP(vfprintf), &UNIFYFS_REAL(vfprintf) }, - { "fscanf", UNIFYFS_WRAP(fscanf), &UNIFYFS_REAL(fscanf) }, - { "vfscanf", UNIFYFS_WRAP(vfscanf), &UNIFYFS_REAL(vfscanf) }, - { "fseek", UNIFYFS_WRAP(fseek), &UNIFYFS_REAL(fseek) }, - { "fseeko", UNIFYFS_WRAP(fseeko), &UNIFYFS_REAL(fseeko) }, - { "ftell", UNIFYFS_WRAP(ftell), &UNIFYFS_REAL(ftell) }, - { "ftello", UNIFYFS_WRAP(ftello), &UNIFYFS_REAL(ftello) }, - { "rewind", UNIFYFS_WRAP(rewind), &UNIFYFS_REAL(rewind) }, - { "fgetpos", UNIFYFS_WRAP(fgetpos), &UNIFYFS_REAL(fgetpos) }, - { "fsetpos", UNIFYFS_WRAP(fsetpos), &UNIFYFS_REAL(fsetpos) }, - { "fflush", UNIFYFS_WRAP(fflush), &UNIFYFS_REAL(fflush) }, - { "feof", UNIFYFS_WRAP(feof), &UNIFYFS_REAL(feof) }, - { "ferror", UNIFYFS_WRAP(ferror), &UNIFYFS_REAL(ferror) }, - { "clearerr", UNIFYFS_WRAP(clearerr), &UNIFYFS_REAL(clearerr) }, - { "fileno", UNIFYFS_WRAP(fileno), &UNIFYFS_REAL(fileno) }, - { "fclose", UNIFYFS_WRAP(fclose), &UNIFYFS_REAL(fclose) }, - { "fwprintf", UNIFYFS_WRAP(fwprintf), &UNIFYFS_REAL(fwprintf) }, - { "fwscanf", UNIFYFS_WRAP(fwscanf), &UNIFYFS_REAL(fwscanf) }, - { "vfwprintf", UNIFYFS_WRAP(vfwprintf), &UNIFYFS_REAL(vfwprintf) }, - { "vfwscanf", UNIFYFS_WRAP(vfwscanf), &UNIFYFS_REAL(vfwscanf) }, - { "fgetwc", UNIFYFS_WRAP(fgetwc), &UNIFYFS_REAL(fgetwc) }, - { "fgetws", UNIFYFS_WRAP(fgetws), &UNIFYFS_REAL(fgetws) }, - { "fputwc", UNIFYFS_WRAP(fputwc), &UNIFYFS_REAL(fputwc) }, - { "fputws", UNIFYFS_WRAP(fputws), &UNIFYFS_REAL(fputws) }, - { "fwide", UNIFYFS_WRAP(fwide), &UNIFYFS_REAL(fwide) }, - { "getwc", UNIFYFS_WRAP(getwc), 
&UNIFYFS_REAL(getwc) }, - { "putwc", UNIFYFS_WRAP(putwc), &UNIFYFS_REAL(putwc) }, - { "ungetwc", UNIFYFS_WRAP(ungetwc), &UNIFYFS_REAL(ungetwc) }, -}; - -#define GOTCHA_NFUNCS (sizeof(wrap_unifyfs_list) / sizeof(wrap_unifyfs_list[0])) diff --git a/client/src/margo_client.c b/client/src/margo_client.c index ea3676e8f..a8bf18588 100644 --- a/client/src/margo_client.c +++ b/client/src/margo_client.c @@ -1,3 +1,17 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + /************************************************************************** * margo_client.c - Implements the client-server RPC calls (shared-memory) **************************************************************************/ @@ -7,80 +21,57 @@ #include "margo_client.h" /* global rpc context */ -client_rpc_context_t* client_rpc_context; // = NULL +static client_rpc_context_t* client_rpc_context; // = NULL /* register client RPCs */ -static void register_client_rpcs(void) +static void register_client_rpcs(client_rpc_context_t* ctx) { - margo_instance_id mid = client_rpc_context->mid; - - client_rpc_context->rpcs.read_id = - MARGO_REGISTER(mid, "unifyfs_read_rpc", - unifyfs_read_in_t, - unifyfs_read_out_t, - NULL); - - client_rpc_context->rpcs.mread_id = - MARGO_REGISTER(mid, "unifyfs_mread_rpc", - unifyfs_mread_in_t, - unifyfs_mread_out_t, - NULL); - - client_rpc_context->rpcs.mount_id = - MARGO_REGISTER(mid, "unifyfs_mount_rpc", - unifyfs_mount_in_t, - unifyfs_mount_out_t, - NULL); - - client_rpc_context->rpcs.unmount_id = - MARGO_REGISTER(mid, "unifyfs_unmount_rpc", - unifyfs_unmount_in_t, - unifyfs_unmount_out_t, - NULL); - - client_rpc_context->rpcs.metaget_id = - MARGO_REGISTER(mid, "unifyfs_metaget_rpc", - unifyfs_metaget_in_t, unifyfs_metaget_out_t, - NULL); - - client_rpc_context->rpcs.metaset_id = - MARGO_REGISTER(mid, "unifyfs_metaset_rpc", - unifyfs_metaset_in_t, unifyfs_metaset_out_t, - NULL); - - client_rpc_context->rpcs.fsync_id = - MARGO_REGISTER(mid, "unifyfs_fsync_rpc", - unifyfs_fsync_in_t, unifyfs_fsync_out_t, - NULL); - - client_rpc_context->rpcs.filesize_id = - MARGO_REGISTER(mid, "unifyfs_filesize_rpc", - unifyfs_filesize_in_t, - unifyfs_filesize_out_t, - NULL); + /* shorter name for our margo instance id */ + margo_instance_id mid = ctx->mid; + + hg_id_t hgid; + +#define CLIENT_REGISTER_RPC(name) \ + do { \ + hgid = MARGO_REGISTER(mid, "unifyfs_" #name "_rpc", \ + unifyfs_##name##_in_t, \ + unifyfs_##name##_out_t, \ + NULL); \ + ctx->rpcs.name##_id = hgid; \ + } while (0) + + CLIENT_REGISTER_RPC(attach); + CLIENT_REGISTER_RPC(mount); + CLIENT_REGISTER_RPC(unmount); + CLIENT_REGISTER_RPC(metaset); + CLIENT_REGISTER_RPC(metaget); + CLIENT_REGISTER_RPC(filesize); + CLIENT_REGISTER_RPC(truncate); + CLIENT_REGISTER_RPC(unlink); + CLIENT_REGISTER_RPC(laminate); + CLIENT_REGISTER_RPC(fsync); + CLIENT_REGISTER_RPC(read); + CLIENT_REGISTER_RPC(mread); + +#undef CLIENT_REGISTER_RPC } /* initialize margo client-server rpc */ int unifyfs_client_rpc_init(void) { - /* initialize margo */ hg_return_t hret; - char addr_self_string[128]; - hg_size_t addr_self_string_sz = sizeof(addr_self_string); - client_rpc_context_t* rpc_ctx; - rpc_ctx = calloc(1, sizeof(client_rpc_context_t)); 
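
[Editorial note] Backing up to the CLIENT_REGISTER_RPC macro introduced in register_client_rpcs() above: each invocation is shorthand for the hand-written MARGO_REGISTER call it replaces. Roughly, CLIENT_REGISTER_RPC(mount) expands to:

    /* approximate expansion of CLIENT_REGISTER_RPC(mount); the real macro
     * wraps this in a do { ... } while (0) block */
    hgid = MARGO_REGISTER(mid, "unifyfs_mount_rpc",
                          unifyfs_mount_in_t,
                          unifyfs_mount_out_t,
                          NULL);
    ctx->rpcs.mount_id = hgid;
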
- if (NULL == rpc_ctx) { - LOGERR("Failed to allocate client RPC context"); - return UNIFYFS_FAILURE; - } - - /* initialize margo */ + /* lookup margo server address string, + * should be something like: "na+sm://7170/0" */ char* svr_addr_string = rpc_lookup_local_server_addr(); if (svr_addr_string == NULL) { LOGERR("Failed to find local margo RPC server address"); return UNIFYFS_FAILURE; } + + /* duplicate server address string, + * then parse address to pick out protocol portion + * which is the piece before the colon like: "na+sm" */ char* proto = strdup(svr_addr_string); char* colon = strchr(proto, ':'); if (NULL != colon) { @@ -88,43 +79,71 @@ int unifyfs_client_rpc_init(void) } LOGDBG("svr_addr:'%s' proto:'%s'", svr_addr_string, proto); - rpc_ctx->mid = margo_init(proto, MARGO_SERVER_MODE, 1, 1); - assert(rpc_ctx->mid); - free(proto); - margo_diag_start(rpc_ctx->mid); + /* allocate memory for rpc context struct */ + client_rpc_context_t* ctx = calloc(1, sizeof(client_rpc_context_t)); + if (NULL == ctx) { + LOGERR("Failed to allocate client RPC context"); + free(proto); + free(svr_addr_string); + return UNIFYFS_FAILURE; + } + + /* initialize margo */ + ctx->mid = margo_init(proto, MARGO_SERVER_MODE, 1, 1); + assert(ctx->mid); + + /* TODO: want to keep this enabled all the time */ + /* what's this do? */ + margo_diag_start(ctx->mid); /* get server margo address */ - rpc_ctx->svr_addr = HG_ADDR_NULL; - margo_addr_lookup(rpc_ctx->mid, svr_addr_string, - &(rpc_ctx->svr_addr)); + ctx->svr_addr = HG_ADDR_NULL; + margo_addr_lookup(ctx->mid, svr_addr_string, &(ctx->svr_addr)); + + /* done with the protocol and address strings, free them */ + free(proto); free(svr_addr_string); - if (rpc_ctx->svr_addr == HG_ADDR_NULL) { + + /* check that we got a valid margo address for the server */ + if (ctx->svr_addr == HG_ADDR_NULL) { LOGERR("Failed to resolve margo server RPC address"); - free(rpc_ctx); + margo_finalize(ctx->mid); + free(ctx); return UNIFYFS_FAILURE; } - /* get client margo address */ - hret = margo_addr_self(rpc_ctx->mid, &(rpc_ctx->client_addr)); + /* get our own margo address */ + hret = margo_addr_self(ctx->mid, &(ctx->client_addr)); if (hret != HG_SUCCESS) { - LOGERR("margo_addr_self()"); - margo_finalize(rpc_ctx->mid); - free(rpc_ctx); + LOGERR("Failed to acquire our margo address"); + margo_addr_free(ctx->mid, ctx->svr_addr); + margo_finalize(ctx->mid); + free(ctx); return UNIFYFS_FAILURE; } - hret = margo_addr_to_string(rpc_ctx->mid, addr_self_string, - &addr_self_string_sz, rpc_ctx->client_addr); + /* convert our margo address to a string */ + char addr_self_string[128]; + hg_size_t addr_self_string_sz = sizeof(addr_self_string); + hret = margo_addr_to_string(ctx->mid, + addr_self_string, &addr_self_string_sz, ctx->client_addr); if (hret != HG_SUCCESS) { - LOGERR("margo_addr_to_string()"); - margo_finalize(rpc_ctx->mid); - free(rpc_ctx); + LOGERR("Failed to convert our margo address to string"); + margo_addr_free(ctx->mid, ctx->client_addr); + margo_addr_free(ctx->mid, ctx->svr_addr); + margo_finalize(ctx->mid); + free(ctx); return UNIFYFS_FAILURE; } - rpc_ctx->client_addr_str = strdup(addr_self_string); - client_rpc_context = rpc_ctx; - register_client_rpcs(); + /* make a copy of our own margo address string */ + ctx->client_addr_str = strdup(addr_self_string); + + /* look up and record id values for each rpc */ + register_client_rpcs(ctx); + + /* cache context in global variable */ + client_rpc_context = ctx; return UNIFYFS_SUCCESS; } @@ -148,365 +167,646 @@ int 
unifyfs_client_rpc_finalize(void) /* shut down margo */ margo_finalize(ctx->mid); - /* free memory allocated for context structure, - * and set caller's pointer to NULL */ + /* free memory allocated for context structure */ free(ctx->client_addr_str); free(ctx); } + return UNIFYFS_SUCCESS; } -/* invokes the mount rpc function by calling unifyfs_sync_to_del */ -int invoke_client_mount_rpc(void) +/* create and return a margo handle for given rpc id */ +static hg_handle_t create_handle(hg_id_t id) { - hg_handle_t handle; - unifyfs_mount_in_t in; - unifyfs_mount_out_t out; - hg_return_t hret; - int32_t ret; + /* define a temporary to refer to global context */ + client_rpc_context_t* ctx = client_rpc_context; + + /* create handle for specified rpc */ + hg_handle_t handle = HG_HANDLE_NULL; + hg_return_t hret = margo_create(ctx->mid, ctx->svr_addr, id, &handle); + if (hret != HG_SUCCESS) { + LOGERR("margo_create() failed"); + } + return handle; +} + +/* invokes the attach rpc function */ +int invoke_client_attach_rpc(void) +{ + /* check that we have initialized margo */ + if (NULL == client_rpc_context) { + return UNIFYFS_FAILURE; + } + + /* get handle to rpc function */ + hg_handle_t handle = create_handle(client_rpc_context->rpcs.attach_id); + + /* fill in input struct */ + unifyfs_attach_in_t in; + fill_client_attach_info(&in); + + /* call rpc function */ + LOGDBG("invoking the attach rpc function in client"); + hg_return_t hret = margo_forward(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_forward() failed"); + margo_destroy(handle); + return UNIFYFS_ERROR_MARGO; + } + + /* decode response */ + int ret; + unifyfs_attach_out_t out; + hret = margo_get_output(handle, &out); + if (hret == HG_SUCCESS) { + LOGDBG("Got response ret=%" PRIi32, out.ret); + ret = (int) out.ret; + margo_free_output(handle, &out); + } else { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } + + /* free resources */ + margo_destroy(handle); + if (NULL != in.logio_spill_dir) { + free((void*)in.logio_spill_dir); + } + + return ret; +} +/* invokes the mount rpc function */ +int invoke_client_mount_rpc(void) +{ + /* check that we have initialized margo */ if (NULL == client_rpc_context) { return UNIFYFS_FAILURE; } - hret = margo_create(client_rpc_context->mid, - client_rpc_context->svr_addr, - client_rpc_context->rpcs.mount_id, &handle); - assert(hret == HG_SUCCESS); + /* get handle to rpc function */ + hg_handle_t handle = create_handle(client_rpc_context->rpcs.mount_id); /* fill in input struct */ + unifyfs_mount_in_t in; fill_client_mount_info(&in); + + /* pass our margo address to the server */ in.client_addr_str = strdup(client_rpc_context->client_addr_str); + /* call rpc function */ LOGDBG("invoking the mount rpc function in client"); - hret = margo_forward(handle, &in); - assert(hret == HG_SUCCESS); - free((void*)in.external_spill_dir); + hg_return_t hret = margo_forward(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_forward() failed"); + margo_destroy(handle); + return UNIFYFS_ERROR_MARGO; + } + + /* free memory on input struct */ + free((void*)in.mount_prefix); free((void*)in.client_addr_str); /* decode response */ + int ret; + unifyfs_mount_out_t out; hret = margo_get_output(handle, &out); - assert(hret == HG_SUCCESS); - ret = out.ret; - LOGDBG("Got response ret=%" PRIi32, ret); - - unifyfs_key_slice_range = out.max_recs_per_slice; - LOGDBG("set unifyfs_key_slice_range=%zu", unifyfs_key_slice_range); + if (hret == HG_SUCCESS) { + LOGDBG("Got response ret=%" PRIi32, 
out.ret); + ret = (int) out.ret; + if (ret == (int)UNIFYFS_SUCCESS) { + /* get assigned client id, and verify app_id */ + unifyfs_client_id = (int) out.client_id; + int srvr_app_id = (int) out.app_id; + if (unifyfs_app_id != srvr_app_id) { + LOGWARN("mismatch on app_id - using %d, server returned %d", + unifyfs_app_id, srvr_app_id); + } + LOGDBG("My client id is %d", unifyfs_client_id); + } + margo_free_output(handle, &out); + } else { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } - margo_free_output(handle, &out); + /* free resources */ margo_destroy(handle); + return ret; } /* function invokes the unmount rpc */ int invoke_client_unmount_rpc(void) { - hg_handle_t handle; - unifyfs_unmount_in_t in; - unifyfs_unmount_out_t out; - hg_return_t hret; - int32_t ret; - + /* check that we have initialized margo */ if (NULL == client_rpc_context) { return UNIFYFS_FAILURE; } - hret = margo_create(client_rpc_context->mid, - client_rpc_context->svr_addr, - client_rpc_context->rpcs.unmount_id, - &handle); - assert(hret == HG_SUCCESS); + /* get handle to rpc function */ + hg_handle_t handle = create_handle(client_rpc_context->rpcs.unmount_id); /* fill in input struct */ - in.app_id = app_id; - in.local_rank_idx = local_rank_idx; + unifyfs_unmount_in_t in; + in.app_id = (int32_t) unifyfs_app_id; + in.client_id = (int32_t) unifyfs_client_id; + /* call rpc function */ LOGDBG("invoking the unmount rpc function in client"); - hret = margo_forward(handle, &in); - assert(hret == HG_SUCCESS); + hg_return_t hret = margo_forward(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_forward() failed"); + margo_destroy(handle); + return UNIFYFS_ERROR_MARGO; + } /* decode response */ + int ret; + unifyfs_unmount_out_t out; hret = margo_get_output(handle, &out); - assert(hret == HG_SUCCESS); - ret = out.ret; - LOGDBG("Got response ret=%" PRIi32, ret); + if (hret == HG_SUCCESS) { + LOGDBG("Got response ret=%" PRIi32, out.ret); + ret = (int) out.ret; + margo_free_output(handle, &out); + } else { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } - margo_free_output(handle, &out); + /* free resources */ margo_destroy(handle); - return (int)ret; + + return ret; } -/* invokes the client metaset rpc function */ -int invoke_client_metaset_rpc(unifyfs_file_attr_t* f_meta) +/* + * Set the metadata values for a file (after optionally creating it). + * The gfid for the file is in f_meta->gfid. + * + * attr_op: The type of metadata operation being performed (for example, + * a file create versus an update of existing attributes), which + * tells the server how to apply the values in f_meta. + * + * f_meta: The metadata values to update.
+ */ +int invoke_client_metaset_rpc(unifyfs_file_attr_op_e attr_op, + unifyfs_file_attr_t* f_meta) { - hg_handle_t handle; - unifyfs_metaset_in_t in; - unifyfs_metaset_out_t out; - hg_return_t hret; - int32_t ret; - + /* check that we have initialized margo */ if (NULL == client_rpc_context) { return UNIFYFS_FAILURE; } - hret = margo_create(client_rpc_context->mid, - client_rpc_context->svr_addr, - client_rpc_context->rpcs.metaset_id, - &handle); - assert(hret == HG_SUCCESS); + /* get handle to rpc function */ + hg_handle_t handle = create_handle(client_rpc_context->rpcs.metaset_id); /* fill in input struct */ - in.fid = f_meta->fid; - in.gfid = f_meta->gfid; - in.filename = f_meta->filename; - in.mode = f_meta->mode; - in.uid = f_meta->uid; - in.gid = f_meta->gid; - in.size = f_meta->size; - in.atime = f_meta->atime; - in.mtime = f_meta->mtime; - in.ctime = f_meta->ctime; - in.is_laminated = f_meta->is_laminated; - - LOGDBG("invoking the metaset rpc function in client"); - hret = margo_forward(handle, &in); - assert(hret == HG_SUCCESS); + unifyfs_metaset_in_t in; + in.app_id = (int32_t) unifyfs_app_id; + in.client_id = (int32_t) unifyfs_client_id; + in.attr_op = (int32_t) attr_op; + memcpy(&(in.attr), f_meta, sizeof(*f_meta)); + + /* call rpc function */ + LOGDBG("invoking the metaset rpc function in client - gfid:%d file:%s", + in.attr.gfid, in.attr.filename); + hg_return_t hret = margo_forward(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_forward() failed"); + margo_destroy(handle); + return UNIFYFS_ERROR_MARGO; + } /* decode response */ + int ret; + unifyfs_metaset_out_t out; hret = margo_get_output(handle, &out); - assert(hret == HG_SUCCESS); - ret = out.ret; - LOGDBG("Got response ret=%" PRIi32, ret); + if (hret == HG_SUCCESS) { + LOGDBG("Got response ret=%" PRIi32, out.ret); + ret = (int) out.ret; + margo_free_output(handle, &out); + } else { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } - margo_free_output(handle, &out); + /* free resources */ margo_destroy(handle); - return (int)ret; + + return ret; } /* invokes the client metaget rpc function */ -int invoke_client_metaget_rpc(int gfid, - unifyfs_file_attr_t* file_meta) +int invoke_client_metaget_rpc(int gfid, unifyfs_file_attr_t* file_meta) { - hg_handle_t handle; + /* check that we have initialized margo */ + if (NULL == client_rpc_context) { + return UNIFYFS_FAILURE; + } + + /* get handle to rpc function */ + hg_handle_t handle = create_handle(client_rpc_context->rpcs.metaget_id); + + /* fill in input struct */ unifyfs_metaget_in_t in; + in.app_id = (int32_t) unifyfs_app_id; + in.client_id = (int32_t) unifyfs_client_id; + in.gfid = (int32_t)gfid; + + /* call rpc function */ + LOGDBG("invoking the metaget rpc function in client"); + hg_return_t hret = margo_forward(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_forward() failed"); + margo_destroy(handle); + return UNIFYFS_ERROR_MARGO; + } + + /* decode response */ + int ret; unifyfs_metaget_out_t out; - hg_return_t hret; - int32_t ret; + hret = margo_get_output(handle, &out); + if (hret == HG_SUCCESS) { + LOGDBG("Got response ret=%" PRIi32, out.ret); + ret = (int) out.ret; + if (ret == (int)UNIFYFS_SUCCESS) { + /* fill in results */ + memset(file_meta, 0, sizeof(unifyfs_file_attr_t)); + *file_meta = out.attr; + if (NULL != out.attr.filename) { + file_meta->filename = strdup(out.attr.filename); + } + } + margo_free_output(handle, &out); + } else { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } + /* free 
resources */ + margo_destroy(handle); + + return ret; +} + +/* invokes the client filesize rpc function */ +int invoke_client_filesize_rpc(int gfid, size_t* outsize) +{ + /* check that we have initialized margo */ if (NULL == client_rpc_context) { return UNIFYFS_FAILURE; } - hret = margo_create(client_rpc_context->mid, - client_rpc_context->svr_addr, - client_rpc_context->rpcs.metaget_id, - &handle); - assert(hret == HG_SUCCESS); + /* get handle to rpc function */ + hg_handle_t handle = create_handle(client_rpc_context->rpcs.filesize_id); /* fill in input struct */ - in.gfid = (int32_t)gfid; - LOGDBG("invoking the metaget rpc function in client"); - hret = margo_forward(handle, &in); - assert(hret == HG_SUCCESS); + unifyfs_filesize_in_t in; + in.app_id = (int32_t) unifyfs_app_id; + in.client_id = (int32_t) unifyfs_client_id; + in.gfid = (int32_t) gfid; + + /* call rpc function */ + LOGDBG("invoking the filesize rpc function in client"); + hg_return_t hret = margo_forward(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_forward() failed"); + margo_destroy(handle); + return UNIFYFS_ERROR_MARGO; + } /* decode response */ + int ret; + unifyfs_filesize_out_t out; hret = margo_get_output(handle, &out); - assert(hret == HG_SUCCESS); - ret = out.ret; - LOGDBG("Got response ret=%" PRIi32, ret); - - if (ret == (int32_t)UNIFYFS_SUCCESS) { - /* fill in results */ - memset(file_meta, 0, sizeof(unifyfs_file_attr_t)); - strcpy(file_meta->filename, out.filename); - file_meta->gfid = gfid; - file_meta->mode = out.mode; - file_meta->uid = out.uid; - file_meta->gid = out.gid; - file_meta->size = out.size; - file_meta->atime = out.atime; - file_meta->mtime = out.mtime; - file_meta->ctime = out.ctime; - file_meta->is_laminated = out.is_laminated; - } - - margo_free_output(handle, &out); + if (hret == HG_SUCCESS) { + LOGDBG("Got response ret=%" PRIi32, out.ret); + ret = (int) out.ret; + if (ret == (int)UNIFYFS_SUCCESS) { + *outsize = (size_t) out.filesize; + } + margo_free_output(handle, &out); + } else { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } + + /* free resources */ margo_destroy(handle); - return (int)ret; + + return ret; } -/* invokes the client fsync rpc function */ -int invoke_client_fsync_rpc(int gfid) +/* invokes the client truncate rpc function */ +int invoke_client_truncate_rpc(int gfid, size_t filesize) { - hg_handle_t handle; - unifyfs_fsync_in_t in; - unifyfs_fsync_out_t out; - hg_return_t hret; - int32_t ret; - + /* check that we have initialized margo */ if (NULL == client_rpc_context) { return UNIFYFS_FAILURE; } - hret = margo_create(client_rpc_context->mid, - client_rpc_context->svr_addr, - client_rpc_context->rpcs.fsync_id, - &handle); - assert(hret == HG_SUCCESS); + /* get handle to rpc function */ + hg_handle_t handle = create_handle(client_rpc_context->rpcs.truncate_id); /* fill in input struct */ - in.app_id = (int32_t)app_id; - in.local_rank_idx = (int32_t)local_rank_idx; - in.gfid = (int32_t)gfid; + unifyfs_truncate_in_t in; + in.app_id = (int32_t) unifyfs_app_id; + in.client_id = (int32_t) unifyfs_client_id; + in.gfid = (int32_t) gfid; + in.filesize = (hg_size_t) filesize; - LOGDBG("invoking the fsync rpc function in client"); - hret = margo_forward(handle, &in); - assert(hret == HG_SUCCESS); + /* call rpc function */ + LOGDBG("invoking the truncate rpc function in client"); + hg_return_t hret = margo_forward(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_forward() failed"); + margo_destroy(handle); + return UNIFYFS_ERROR_MARGO; + } 
/* decode response */ + int ret; + unifyfs_truncate_out_t out; hret = margo_get_output(handle, &out); - assert(hret == HG_SUCCESS); - ret = out.ret; - LOGDBG("Got response ret=%" PRIi32, ret); + if (hret == HG_SUCCESS) { + LOGDBG("Got response ret=%" PRIi32, out.ret); + ret = (int) out.ret; + margo_free_output(handle, &out); + } else { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } - margo_free_output(handle, &out); + /* free resources */ margo_destroy(handle); - return (int)ret; + + return ret; } -/* invokes the client filesize rpc function */ -int invoke_client_filesize_rpc(int gfid, - size_t* outsize) +/* invokes the client unlink rpc function */ +int invoke_client_unlink_rpc(int gfid) { - int32_t ret; - hg_handle_t handle; + /* check that we have initialized margo */ + if (NULL == client_rpc_context) { + return UNIFYFS_FAILURE; + } + + /* get handle to rpc function */ + hg_handle_t handle = create_handle(client_rpc_context->rpcs.unlink_id); + + /* fill in input struct */ + unifyfs_unlink_in_t in; + in.app_id = (int32_t) unifyfs_app_id; + in.client_id = (int32_t) unifyfs_client_id; + in.gfid = (int32_t) gfid; + + /* call rpc function */ + LOGDBG("invoking the unlink rpc function in client"); + hg_return_t hret = margo_forward(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_forward() failed"); + margo_destroy(handle); + return UNIFYFS_ERROR_MARGO; + } + + /* decode response */ + int ret; + unifyfs_unlink_out_t out; + hret = margo_get_output(handle, &out); + if (hret == HG_SUCCESS) { + LOGDBG("Got response ret=%" PRIi32, out.ret); + ret = (int) out.ret; + margo_free_output(handle, &out); + } else { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } + + /* free resources */ + margo_destroy(handle); + return ret; +} + +/* invokes the client-to-server laminate rpc function */ +int invoke_client_laminate_rpc(int gfid) +{ + /* check that we have initialized margo */ if (NULL == client_rpc_context) { return UNIFYFS_FAILURE; } /* get handle to rpc function */ - hg_return_t hret = margo_create(client_rpc_context->mid, - client_rpc_context->svr_addr, - client_rpc_context->rpcs.filesize_id, - &handle); - assert(hret == HG_SUCCESS); + hg_handle_t handle = create_handle(client_rpc_context->rpcs.laminate_id); /* fill in input struct */ - unifyfs_filesize_in_t in; - in.app_id = (int32_t)app_id; - in.local_rank_idx = (int32_t)local_rank_idx; - in.gfid = (int32_t)gfid; + unifyfs_laminate_in_t in; + in.app_id = (int32_t) unifyfs_app_id; + in.client_id = (int32_t) unifyfs_client_id; + in.gfid = (int32_t) gfid; /* call rpc function */ - LOGDBG("invoking the filesize rpc function in client"); - hret = margo_forward(handle, &in); - assert(hret == HG_SUCCESS); + LOGDBG("invoking the laminate rpc function in client"); + hg_return_t hret = margo_forward(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_forward() failed"); + margo_destroy(handle); + return UNIFYFS_ERROR_MARGO; + } /* decode response */ - unifyfs_filesize_out_t out; + int ret; + unifyfs_laminate_out_t out; hret = margo_get_output(handle, &out); - assert(hret == HG_SUCCESS); - ret = out.ret; - LOGDBG("Got response ret=%" PRIu32, ret); + if (hret == HG_SUCCESS) { + LOGDBG("Got response ret=%" PRIi32, out.ret); + ret = (int) out.ret; + margo_free_output(handle, &out); + } else { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } + + /* free resources */ + margo_destroy(handle); - /* save output from function */ - *outsize = (size_t) out.filesize; + return ret; +} + 
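+/* The blocking wrappers above all follow the same request/response
+ * pattern: create a handle for the rpc id, fill in the input struct,
+ * margo_forward() it, decode the reply, then destroy the handle. As a
+ * rough usage sketch (error handling trimmed; 'gfid' is assumed to be
+ * a valid global file id held by the caller):
+ *
+ *     size_t size = 0;
+ *     int rc = invoke_client_filesize_rpc(gfid, &size);
+ *     if (rc == UNIFYFS_SUCCESS) {
+ *         // 'size' now holds the server's view of the file size
+ *     }
+ */
+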
+/* invokes the client sync rpc function */ +int invoke_client_sync_rpc(int gfid) +{ + /* check that we have initialized margo */ + if (NULL == client_rpc_context) { + return UNIFYFS_FAILURE; + } + + /* get handle to rpc function */ + hg_handle_t handle = create_handle(client_rpc_context->rpcs.fsync_id); + + /* fill in input struct */ + unifyfs_fsync_in_t in; + in.app_id = (int32_t) unifyfs_app_id; + in.client_id = (int32_t) unifyfs_client_id; + in.gfid = (int32_t) gfid; + + /* call rpc function */ + LOGDBG("invoking the sync rpc function in client"); + hg_return_t hret = margo_forward(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_forward() failed"); + margo_destroy(handle); + return UNIFYFS_ERROR_MARGO; + } + + /* decode response */ + int ret; + unifyfs_fsync_out_t out; + hret = margo_get_output(handle, &out); + if (hret == HG_SUCCESS) { + LOGDBG("Got response ret=%" PRIi32, out.ret); + ret = (int) out.ret; + margo_free_output(handle, &out); + } else { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } /* free resources */ - margo_free_output(handle, &out); margo_destroy(handle); - return (int)ret; + + return ret; } /* invokes the client read rpc function */ -int invoke_client_read_rpc(int gfid, - size_t offset, - size_t length) +int invoke_client_read_rpc(int gfid, size_t offset, size_t length) { - hg_handle_t handle; - unifyfs_read_in_t in; - unifyfs_read_out_t out; - hg_return_t hret; - int32_t ret; - + /* check that we have initialized margo */ if (NULL == client_rpc_context) { return UNIFYFS_FAILURE; } - /* fill in input struct */ - hret = margo_create(client_rpc_context->mid, - client_rpc_context->svr_addr, - client_rpc_context->rpcs.read_id, - &handle); - assert(hret == HG_SUCCESS); + /* get handle to rpc function */ + hg_handle_t handle = create_handle(client_rpc_context->rpcs.read_id); /* fill in input struct */ - in.app_id = (int32_t)app_id; - in.local_rank_idx = (int32_t)local_rank_idx; - in.gfid = (int32_t)gfid; - in.offset = (hg_size_t)offset; - in.length = (hg_size_t)length; + unifyfs_read_in_t in; + in.app_id = (int32_t) unifyfs_app_id; + in.client_id = (int32_t) unifyfs_client_id; + in.gfid = (int32_t) gfid; + in.offset = (hg_size_t) offset; + in.length = (hg_size_t) length; + /* call rpc function */ LOGDBG("invoking the read rpc function in client"); - hret = margo_forward(handle, &in); - assert(hret == HG_SUCCESS); + hg_return_t hret = margo_forward(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_forward() failed"); + margo_destroy(handle); + return UNIFYFS_ERROR_MARGO; + } /* decode response */ + int ret; + unifyfs_read_out_t out; hret = margo_get_output(handle, &out); - assert(hret == HG_SUCCESS); - ret = out.ret; - LOGDBG("Got response ret=%" PRIi32, ret); + if (hret == HG_SUCCESS) { + LOGDBG("Got response ret=%" PRIi32, out.ret); + ret = (int) out.ret; + margo_free_output(handle, &out); + } else { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } - margo_free_output(handle, &out); + /* free resources */ margo_destroy(handle); - return (int)ret; + + return ret; +} + +int unifyfs_mread_rpc_status_check(unifyfs_mread_rpc_ctx_t* ctx) +{ + int ret = 0; + int flag = 0; + + if (!ctx) { + return -EINVAL; + } + + ret = margo_test(ctx->req, &flag); + if (ret) { + return -EINVAL; /* assume that the given ctx is invalid */ + } + + /* flag becomes 1 when rpc is complete (otherwise 0) */ + if (flag) { + unifyfs_mread_out_t out; + hg_return_t hret = margo_get_output(ctx->handle, &out); + if (hret == HG_SUCCESS) { + 
ctx->rpc_ret = out.ret; + margo_free_output(ctx->handle, &out); + } else { + /* we failed to get the correct response from the server and + * assume that the rpc failed. */ + ctx->rpc_ret = UNIFYFS_ERROR_MARGO; + } + + margo_destroy(ctx->handle); + } + + return flag; } /* invokes the client mread rpc function */ -int invoke_client_mread_rpc(int read_count, - size_t size, - void* buffer) +int invoke_client_mread_rpc(int read_count, size_t size, void* buffer, + unifyfs_mread_rpc_ctx_t* ctx) { - hg_handle_t handle; - unifyfs_mread_in_t in; - unifyfs_mread_out_t out; - hg_return_t hret; - int32_t ret; + int ret = UNIFYFS_SUCCESS; + /* check that we have initialized margo */ if (NULL == client_rpc_context) { return UNIFYFS_FAILURE; } - /* fill in input struct */ - hret = margo_create(client_rpc_context->mid, - client_rpc_context->svr_addr, - client_rpc_context->rpcs.mread_id, - &handle); - assert(hret == HG_SUCCESS); + if (NULL == ctx) { + return UNIFYFS_FAILURE; + } - hret = margo_bulk_create(client_rpc_context->mid, 1, &buffer, &size, - HG_BULK_READ_ONLY, &in.bulk_handle); - assert(hret == HG_SUCCESS); + /* get handle to rpc function */ + hg_handle_t handle = create_handle(client_rpc_context->rpcs.mread_id); + margo_request req; - /* fill in input struct */ - in.app_id = (int32_t)app_id; - in.local_rank_idx = (int32_t)local_rank_idx; - in.read_count = (int32_t)read_count; - in.bulk_size = (hg_size_t)size; + unifyfs_mread_in_t in; + hg_return_t hret = margo_bulk_create( + client_rpc_context->mid, 1, &buffer, &size, + HG_BULK_READ_ONLY, &in.bulk_handle); + if (hret != HG_SUCCESS) { + return UNIFYFS_ERROR_MARGO; + } - LOGDBG("invoking the read rpc function in client"); - hret = margo_forward(handle, &in); - assert(hret == HG_SUCCESS); + /* fill in input struct */ + in.app_id = (int32_t) unifyfs_app_id; + in.client_id = (int32_t) unifyfs_client_id; + in.read_count = (int32_t) read_count; + in.bulk_size = (hg_size_t) size; - /* decode response */ - hret = margo_get_output(handle, &out); - assert(hret == HG_SUCCESS); - ret = out.ret; - LOGDBG("Got response ret=%" PRIi32, ret); + /* call rpc function */ + LOGDBG("invoking the mread rpc function in client"); + hret = margo_iforward(handle, &in, &req); + if (HG_SUCCESS == hret) { + ctx->handle = handle; + ctx->req = req; + } else { + LOGERR("margo_iforward() failed"); + ret = UNIFYFS_ERROR_MARGO; + } + /* margo_iforward serializes all data before returning, and it's safe to + * free the rpc params */ margo_bulk_free(in.bulk_handle); - margo_free_output(handle, &out); - margo_destroy(handle); - return (int)ret; + + return ret; } diff --git a/client/src/margo_client.h b/client/src/margo_client.h index d4b0861c7..1c53b6026 100644 --- a/client/src/margo_client.h +++ b/client/src/margo_client.h @@ -1,3 +1,17 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
+ */ + #ifndef _MARGO_CLIENT_H #define _MARGO_CLIENT_H @@ -10,14 +24,18 @@ #include "unifyfs_client_rpcs.h" typedef struct ClientRpcIds { - hg_id_t filesize_id; - hg_id_t read_id; - hg_id_t mread_id; + hg_id_t attach_id; hg_id_t mount_id; hg_id_t unmount_id; - hg_id_t metaget_id; hg_id_t metaset_id; + hg_id_t metaget_id; + hg_id_t filesize_id; + hg_id_t truncate_id; + hg_id_t unlink_id; + hg_id_t laminate_id; hg_id_t fsync_id; + hg_id_t read_id; + hg_id_t mread_id; } client_rpcs_t; typedef struct ClientRpcContext { @@ -33,28 +51,57 @@ int unifyfs_client_rpc_init(void); int unifyfs_client_rpc_finalize(void); -void fill_client_mount_info(unifyfs_mount_in_t* in); +void fill_client_attach_info(unifyfs_attach_in_t* in); +int invoke_client_attach_rpc(void); +void fill_client_mount_info(unifyfs_mount_in_t* in); int invoke_client_mount_rpc(void); int invoke_client_unmount_rpc(void); -int invoke_client_metaset_rpc(unifyfs_file_attr_t* f_meta); - -int invoke_client_metaget_rpc(int gfid, +int invoke_client_metaset_rpc(unifyfs_file_attr_op_e attr_op, unifyfs_file_attr_t* f_meta); -int invoke_client_fsync_rpc(int gfid); +int invoke_client_metaget_rpc(int gfid, unifyfs_file_attr_t* f_meta); + +int invoke_client_filesize_rpc(int gfid, size_t* filesize); + +int invoke_client_truncate_rpc(int gfid, size_t filesize); + +int invoke_client_unlink_rpc(int gfid); + +int invoke_client_laminate_rpc(int gfid); + +int invoke_client_sync_rpc(int gfid); + +int invoke_client_read_rpc(int gfid, size_t offset, size_t length); + +/* + * The mread rpc function is non-blocking (using margo_iforward), and the response + * from the server should be checked by the caller manually using + * the unifyfs_mread_rpc_status_check function. + */ +struct unifyfs_mread_rpc_ctx { + margo_request req; /* margo request to track the iforward result */ + hg_handle_t handle; /* rpc handle */ + int rpc_ret; /* rpc response from the server */ +}; + +typedef struct unifyfs_mread_rpc_ctx unifyfs_mread_rpc_ctx_t; -int invoke_client_filesize_rpc(int gfid, - size_t* filesize); +/** + * @brief Track the progress of the submitted rpc. If the rpc is done, this + * function returns 1 with the server response stored in @ctx->rpc_ret. + * + * @param ctx pointer to the rpc ctx + * + * @return 1 if the rpc is done (a response was received from the server), 0 if still in + * progress, and -EINVAL if @ctx is invalid. + */ +int unifyfs_mread_rpc_status_check(unifyfs_mread_rpc_ctx_t* ctx); -int invoke_client_read_rpc(int gfid, - size_t offset, - size_t length); -int invoke_client_mread_rpc(int read_count, - size_t size, - void* buffer); +int invoke_client_mread_rpc(int read_count, size_t size, void* buffer, + unifyfs_mread_rpc_ctx_t* ctx); #endif // MARGO_CLIENT_H diff --git a/client/src/pmpi_wrappers.c b/client/src/pmpi_wrappers.c index ac86d39ba..c0e9c730b 100644 --- a/client/src/pmpi_wrappers.c +++ b/client/src/pmpi_wrappers.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2019, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2019, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -12,10 +12,12 @@ * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text.
*/ + +#include + #include "pmpi_wrappers.h" #include "unifyfs.h" -#include -#include +#include "unifyfs_rc.h" int unifyfs_mpi_init(int* argc, char*** argv) { @@ -37,7 +39,7 @@ int unifyfs_mpi_init(int* argc, char*** argv) rc = unifyfs_mount("/unifyfs", rank, (size_t)world_sz, app_id); if (UNIFYFS_SUCCESS != rc) { fprintf(stderr, "UNIFYFS ERROR: unifyfs_mount() failed with '%s'\n", - unifyfs_error_enum_description((unifyfs_error_e)rc)); + unifyfs_rc_enum_description((unifyfs_rc)rc)); } return ret; @@ -66,7 +68,7 @@ int unifyfs_mpi_finalize(void) rc = unifyfs_unmount(); if (UNIFYFS_SUCCESS != rc) { fprintf(stderr, "UNIFYFS ERROR: unifyfs_unmount() failed with '%s'\n", - unifyfs_error_enum_description((unifyfs_error_e)rc)); + unifyfs_rc_enum_description((unifyfs_rc)rc)); } //fprintf(stderr, "DEBUG: %s - before PMPI_Finalize()\n", __func__); diff --git a/client/src/pmpi_wrappers.h b/client/src/pmpi_wrappers.h index b90057ce4..d2066dd59 100644 --- a/client/src/pmpi_wrappers.h +++ b/client/src/pmpi_wrappers.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2019, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2019, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -15,6 +15,8 @@ #ifndef UNIFYFS_PMPI_WRAPPERS_H #define UNIFYFS_PMPI_WRAPPERS_H +#include + /* MPI_Init PMPI wrapper */ int unifyfs_mpi_init(int* argc, char*** argv); int MPI_Init(int* argc, char*** argv); diff --git a/client/src/unifyfs-dirops.c b/client/src/unifyfs-dirops.c index 8325152f5..2d0b06946 100644 --- a/client/src/unifyfs-dirops.c +++ b/client/src/unifyfs-dirops.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -11,7 +11,6 @@ * For details, see https://github.com/LLNL/UnifyFS. * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. */ -#include #include "unifyfs-sysio.h" @@ -89,7 +88,8 @@ DIR* UNIFYFS_WRAP(opendir)(const char* name) { /* call real opendir and return early if this is * not one of our paths */ - if (!unifyfs_intercept_path(name)) { + char upath[UNIFYFS_MAX_FILENAME]; + if (!unifyfs_intercept_path(name, upath)) { MAP_OR_FAIL(opendir); return UNIFYFS_REAL(opendir)(name); } @@ -99,11 +99,11 @@ DIR* UNIFYFS_WRAP(opendir)(const char* name) * if valid, populate the local file meta cache accordingly. */ - int fid = unifyfs_get_fid_from_path(name); - int gfid = unifyfs_generate_gfid(name); + int fid = unifyfs_get_fid_from_path(upath); + int gfid = unifyfs_generate_gfid(upath); unifyfs_file_attr_t gfattr = { 0, }; - int ret = unifyfs_get_global_file_meta(fid, gfid, &gfattr); + int ret = unifyfs_get_global_file_meta(gfid, &gfattr); if (ret != UNIFYFS_SUCCESS) { errno = ENOENT; return NULL; @@ -121,6 +121,7 @@ DIR* UNIFYFS_WRAP(opendir)(const char* name) unifyfs_filemeta_t* meta = NULL; if (fid >= 0) { meta = unifyfs_get_meta_from_fid(fid); + assert(meta != NULL); /* * FIXME: We found an inconsistent status between local cache and @@ -128,30 +129,23 @@ DIR* UNIFYFS_WRAP(opendir)(const char* name) * re-populate with the global data? 
*/ if (!unifyfs_fid_is_dir(fid)) { - errno = EIO; + errno = ENOTDIR; return NULL; } - - /* - * FIXME: also, is it safe to oeverride this local data? - */ - meta->size = sb.st_size; - meta->chunks = sb.st_blocks; - meta->log_size = 0; /* no need of local storage for dir operations */ } else { - fid = unifyfs_fid_create_file(name); + fid = unifyfs_fid_create_file(upath); if (fid < 0) { - errno = EIO; + errno = unifyfs_rc_errno(-fid); return NULL; } meta = unifyfs_get_meta_from_fid(fid); + assert(meta != NULL); meta->mode = (meta->mode & ~S_IFREG) | S_IFDIR; /* set as directory */ - meta->size = sb.st_size; - meta->chunks = sb.st_blocks; - meta->log_size = 0; } + meta->global_size = sb.st_size; + unifyfs_dirstream_t* dirp = unifyfs_dirstream_alloc(fid); return (DIR*) dirp; @@ -243,7 +237,8 @@ int UNIFYFS_WRAP(scandir)(const char* path, struct dirent** namelist, int (*compar)(const struct dirent**, const struct dirent**)) { - if (unifyfs_intercept_path(path)) { + char upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(path, upath)) { fprintf(stderr, "Function not yet supported @ %s:%d\n", __FILE__, __LINE__); errno = ENOSYS; diff --git a/client/src/unifyfs-dirops.h b/client/src/unifyfs-dirops.h index 7694ed492..876360391 100644 --- a/client/src/unifyfs-dirops.h +++ b/client/src/unifyfs-dirops.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2017, 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2017, 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -11,14 +11,11 @@ * For details, see https://github.com/LLNL/UnifyFS. * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. */ + #ifndef __UNIFYFS_DIROPS_H #define __UNIFYFS_DIROPS_H -#include - -#include -#include -#include +#include "unifyfs-internal.h" /* * FIXME: is this portable to use the linux dirent structure? diff --git a/client/src/unifyfs-fixed.c b/client/src/unifyfs-fixed.c index 88b4c72a3..7833dcb83 100644 --- a/client/src/unifyfs-fixed.c +++ b/client/src/unifyfs-fixed.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. 
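/* The hunk below replaces the fixed-size chunk store with log-based I/O
 * plus per-file segment trees. A rough sketch of the resulting write path,
 * using only the functions defined in that hunk ('fid', 'meta', 'pos',
 * 'buf', and 'count' are assumed to come from the caller):
 *
 *     size_t nwritten = 0;
 *     int rc = unifyfs_fid_logio_write(fid, meta, pos, buf, count, &nwritten);
 *     // reserves log space, writes the data, then records the extent in
 *     // meta->extents_sync via add_write_meta_to_index()
 *
 *     rc = unifyfs_sync(fid);
 *     // rewrites the shared index from meta->extents_sync and invokes the
 *     // client sync rpc so the server learns about the new extents
 */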
@@ -40,658 +40,293 @@ * Please also read this file LICENSE.CRUISE */ +#include "unifyfs-internal.h" #include "unifyfs-fixed.h" #include "unifyfs_log.h" - -static inline -unifyfs_chunkmeta_t* filemeta_get_chunkmeta(const unifyfs_filemeta_t* meta, - int cid) -{ - unifyfs_chunkmeta_t* chunkmeta = NULL; - uint64_t limit = 0; - - if (unifyfs_use_memfs) { - limit += unifyfs_max_chunks; - } - - if (unifyfs_use_spillover) { - limit += unifyfs_spillover_max_chunks; - } - - if (meta && (cid >= 0 && cid < limit)) { - chunkmeta = &unifyfs_chunkmetas[meta->chunkmeta_idx + cid]; - } - - return chunkmeta; -} - -/* given a file id and logical chunk id, return pointer to meta data - * for specified chunk, return NULL if not found */ -static inline unifyfs_chunkmeta_t* unifyfs_get_chunkmeta(int fid, int cid) -{ - /* lookup file meta data for specified file id */ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - - return filemeta_get_chunkmeta(meta, cid); -} +#include "margo_client.h" +#include "seg_tree.h" /* --------------------------------------- - * Operations on file chunks + * Operations on client write index * --------------------------------------- */ -/* given a logical chunk id and an offset within that chunk, return the pointer - * to the memory location corresponding to that location */ -static inline void* unifyfs_compute_chunk_buf(const unifyfs_filemeta_t* meta, - int cid, off_t offset) -{ - /* get pointer to chunk meta */ - const unifyfs_chunkmeta_t* chunk_meta = filemeta_get_chunkmeta(meta, cid); - - /* identify physical chunk id */ - int physical_id = chunk_meta->id; - - /* compute the start of the chunk */ - char* start = NULL; - if (physical_id < unifyfs_max_chunks) { - start = unifyfs_chunks + ((long)physical_id << unifyfs_chunk_bits); - } else { - /* chunk is in spill over */ - LOGERR("wrong chunk ID"); - return NULL; - } - - /* now add offset */ - char* buf = start + offset; - return (void*)buf; -} - -/* given a chunk id and an offset within that chunk, return the offset - * in the spillover file corresponding to that location */ -static inline off_t unifyfs_compute_spill_offset(const unifyfs_filemeta_t* meta, - int cid, off_t offset) +/* + * Clear all entries in the log index. This only clears the metadata, + * not the data itself. 
+ */ +static void clear_index(void) { - /* get pointer to chunk meta */ - const unifyfs_chunkmeta_t* chunk_meta = filemeta_get_chunkmeta(meta, cid); - - /* identify physical chunk id */ - int physical_id = chunk_meta->id; - - /* compute start of chunk in spill over device */ - off_t start = 0; - if (physical_id < unifyfs_max_chunks) { - LOGERR("wrong spill-chunk ID"); - return -1; - } else { - /* compute buffer loc within spillover device chunk */ - /* account for the unifyfs_max_chunks added to identify location when - * grabbing this chunk */ - start = ((long)(physical_id - unifyfs_max_chunks) << unifyfs_chunk_bits); - } - - off_t buf = start + offset; - return buf; + *unifyfs_indices.ptr_num_entries = 0; } -/* allocate a new chunk for the specified file and logical chunk id */ -static int unifyfs_chunk_alloc(int fid, unifyfs_filemeta_t* meta, int chunk_id) +/* Add the metadata for a single write to the index */ +static int add_write_meta_to_index(unifyfs_filemeta_t* meta, + off_t file_pos, + off_t log_pos, + size_t length) { - /* get pointer to chunk meta data */ - unifyfs_chunkmeta_t* chunk_meta = filemeta_get_chunkmeta(meta, chunk_id); - - /* allocate a chunk and record its location */ - if (unifyfs_use_memfs) { - /* allocate a new chunk from memory */ - unifyfs_stack_lock(); - int id = unifyfs_stack_pop(free_chunk_stack); - unifyfs_stack_unlock(); - - /* if we got one return, otherwise try spill over */ - if (id >= 0) { - /* got a chunk from memory */ - chunk_meta->location = CHUNK_LOCATION_MEMFS; - chunk_meta->id = id; - } else if (unifyfs_use_spillover) { - /* shm segment out of space, grab a block from spill-over device */ - LOGDBG("getting blocks from spill-over device"); - - /* TODO: missing lock calls? */ - /* add unifyfs_max_chunks to identify chunk location */ - unifyfs_stack_lock(); - id = unifyfs_stack_pop(free_spillchunk_stack) + unifyfs_max_chunks; - unifyfs_stack_unlock(); - if (id < unifyfs_max_chunks) { - LOGERR("spill-over device out of space (%d)", id); - return UNIFYFS_ERROR_NOSPC; - } - - /* got one from spill over */ - chunk_meta->location = CHUNK_LOCATION_SPILLOVER; - chunk_meta->id = id; - } else { - /* spill over isn't available, so we're out of space */ - LOGERR("memfs out of space (%d)", id); - return UNIFYFS_ERROR_NOSPC; - } - } else if (unifyfs_use_spillover) { - /* memory file system is not enabled, but spill over is */ - - /* shm segment out of space, grab a block from spill-over device */ - LOGDBG("getting blocks from spill-over device"); - - /* TODO: missing lock calls? 
*/ - /* add unifyfs_max_chunks to identify chunk location */ - unifyfs_stack_lock(); - int id = unifyfs_stack_pop(free_spillchunk_stack) + unifyfs_max_chunks; - unifyfs_stack_unlock(); - if (id < unifyfs_max_chunks) { - LOGERR("spill-over device out of space (%d)", id); - return UNIFYFS_ERROR_NOSPC; - } - - /* got one from spill over */ - chunk_meta->location = CHUNK_LOCATION_SPILLOVER; - chunk_meta->id = id; - } else { - /* don't know how to allocate chunk */ - chunk_meta->location = CHUNK_LOCATION_NULL; - return UNIFYFS_ERROR_IO; + /* add write extent to our segment trees */ + if (unifyfs_local_extents) { + /* record write extent in our local cache */ + seg_tree_add(&meta->extents, + file_pos, + file_pos + length - 1, + log_pos); } - return UNIFYFS_SUCCESS; -} - -static int unifyfs_chunk_free(int fid, unifyfs_filemeta_t* meta, int chunk_id) -{ - /* get pointer to chunk meta data */ - unifyfs_chunkmeta_t* chunk_meta = filemeta_get_chunkmeta(meta, chunk_id); - - /* get physical id of chunk */ - int id = chunk_meta->id; - LOGDBG("free chunk %d from location %d", id, chunk_meta->location); - - /* determine location of chunk */ - if (chunk_meta->location == CHUNK_LOCATION_MEMFS) { - unifyfs_stack_lock(); - unifyfs_stack_push(free_chunk_stack, id); - unifyfs_stack_unlock(); - } else if (chunk_meta->location == CHUNK_LOCATION_SPILLOVER) { - /* TODO: free spill over chunk */ - } else { - /* unkwown chunk location */ - LOGERR("unknown chunk location %d", chunk_meta->location); - return UNIFYFS_ERROR_IO; + /* + * We want to make sure this write will not overflow the maximum + * number of index entries we can sync with server. A write can at most + * create two new nodes in the seg_tree. If we're close to potentially + * filling up the index, sync it out. + */ + unsigned long count_before = seg_tree_count(&meta->extents_sync); + if (count_before >= (unifyfs_max_index_entries - 2)) { + /* this will flush our segments, sync them, and set the running + * segment count back to 0 */ + unifyfs_sync(meta->fid); } - /* update location of chunk */ - chunk_meta->location = CHUNK_LOCATION_NULL; + /* store the write in our segment tree used for syncing with server. */ + seg_tree_add(&meta->extents_sync, + file_pos, + file_pos + length - 1, + log_pos); return UNIFYFS_SUCCESS; } -/* read data from specified chunk id, chunk offset, and count into user buffer, - * count should fit within chunk starting from specified offset */ -static int unifyfs_chunk_read( - unifyfs_filemeta_t* meta, /* pointer to file meta data */ - int chunk_id, /* logical chunk id to read data from */ - off_t chunk_offset, /* logical offset within chunk to read from */ - void* buf, /* buffer to store data to */ - size_t count) /* number of bytes to read */ +/* + * Remove all entries in the current index and re-write it using the write + * metadata stored in the target file's extents_sync segment tree. This only + * re-writes the metadata in the index. All the actual data is still kept + * in the write log and will be referenced correctly by the new metadata. + * + * After this function is done, 'unifyfs_indices' will have been totally + * re-written. The writes in the index will be flattened, non-overlapping, + * and sequential. The extents_sync segment tree will be cleared. + * + * This function is called when we sync our extents with the server. + * + * Returns maximum write log offset for synced extents. 
+ */ +off_t unifyfs_rewrite_index_from_seg_tree(unifyfs_filemeta_t* meta) { - /* get chunk meta data */ - unifyfs_chunkmeta_t* chunk_meta = filemeta_get_chunkmeta(meta, chunk_id); - - /* determine location of chunk */ - if (chunk_meta->location == CHUNK_LOCATION_MEMFS) { - /* just need a memcpy to read data */ - void* chunk_buf = unifyfs_compute_chunk_buf( - meta, chunk_id, chunk_offset); - memcpy(buf, chunk_buf, count); - } else if (chunk_meta->location == CHUNK_LOCATION_SPILLOVER) { - /* spill over to a file, so read from file descriptor */ - //MAP_OR_FAIL(pread); - off_t spill_offset = unifyfs_compute_spill_offset(meta, chunk_id, chunk_offset); - ssize_t rc = pread(unifyfs_spilloverblock, buf, count, spill_offset); - if (rc < 0) { - return unifyfs_errno_map_to_err(rc); + /* get pointer to index buffer */ + unifyfs_index_t* indexes = unifyfs_indices.index_entry; + + /* Erase the index before we re-write it */ + clear_index(); + + /* count up number of entries we wrote to buffer */ + unsigned long idx = 0; + + /* record maximum write log offset */ + off_t max_log_offset = 0; + + int gfid = meta->gfid; + + seg_tree_rdlock(&meta->extents_sync); + /* For each write in this file's seg_tree ... */ + struct seg_tree_node* node = NULL; + while ((node = seg_tree_iter(&meta->extents_sync, node))) { + indexes[idx].file_pos = node->start; + indexes[idx].log_pos = node->ptr; + indexes[idx].length = node->end - node->start + 1; + indexes[idx].gfid = gfid; + idx++; + if ((off_t)(node->end) > max_log_offset) { + max_log_offset = (off_t) node->end; } - } else { - /* unknown chunk type */ - LOGERR("unknown chunk type"); - return UNIFYFS_ERROR_IO; } + seg_tree_unlock(&meta->extents_sync); + /* All done processing this file's writes. Clear its seg_tree */ + seg_tree_clear(&meta->extents_sync); - /* assume read was successful if we get to here */ - return UNIFYFS_SUCCESS; + /* record total number of entries in index buffer */ + *unifyfs_indices.ptr_num_entries = idx; + + return max_log_offset; } -/* given an index, split it into multiple indices whose range is equal or - * smaller than slice_range size @param cur_idx: the index to split - * @param slice_range: the slice size of the key-value store - * @return index_set: the set of split indices */ -int unifyfs_split_index(unifyfs_index_t* cur_idx, index_set_t* index_set, - long slice_range) +/* + * Find any write extents that span or exceed the truncation point and remove them. + * + * This function is called when we truncate a file and there are cached writes. + */ +int truncate_write_meta(unifyfs_filemeta_t* meta, off_t trunc_sz) { + if (0 == trunc_sz) { + /* All writes should be removed.
Clear extents_sync */ + seg_tree_clear(&meta->extents_sync); - long cur_idx_start = cur_idx->file_pos; - long cur_idx_end = cur_idx->file_pos + cur_idx->length - 1; - - long cur_slice_start = cur_idx->file_pos / slice_range * slice_range; - long cur_slice_end = cur_slice_start + slice_range - 1; - - - index_set->count = 0; - - long cur_mem_pos = cur_idx->mem_pos; - if (cur_idx_end <= cur_slice_end) { - /* - cur_slice_start cur_slice_end - cur_idx_start cur_idx_end - - */ - index_set->idxes[index_set->count] = *cur_idx; - index_set->count++; - - } else { - /* - cur_slice_start cur_slice_endnext_slice_start next_slice_end - cur_idx_start cur_idx_end - - */ - index_set->idxes[index_set->count] = *cur_idx; - index_set->idxes[index_set->count].length = - cur_slice_end - cur_idx_start + 1; - - cur_mem_pos += index_set->idxes[index_set->count].length; - - cur_slice_start = cur_slice_end + 1; - cur_slice_end = cur_slice_start + slice_range - 1; - index_set->count++; - - while (1) { - if (cur_idx_end <= cur_slice_end) { - break; - } - - index_set->idxes[index_set->count].fid = cur_idx->fid; - index_set->idxes[index_set->count].file_pos = cur_slice_start; - index_set->idxes[index_set->count].length = slice_range; - index_set->idxes[index_set->count].mem_pos = cur_mem_pos; - cur_mem_pos += index_set->idxes[index_set->count].length; - - cur_slice_start = cur_slice_end + 1; - cur_slice_end = cur_slice_start + slice_range - 1; - index_set->count++; - + if (unifyfs_local_extents) { + /* Clear the local extent cache too */ + seg_tree_clear(&meta->extents); } - - index_set->idxes[index_set->count].fid = cur_idx->fid; - index_set->idxes[index_set->count].file_pos = cur_slice_start; - index_set->idxes[index_set->count].length = cur_idx_end - cur_slice_start + 1; - index_set->idxes[index_set->count].mem_pos = cur_mem_pos; - index_set->count++; + return UNIFYFS_SUCCESS; } - return 0; + unsigned long trunc_off = (unsigned long) trunc_sz; + int rc = seg_tree_remove(&meta->extents_sync, trunc_off, ULONG_MAX); + if (unifyfs_local_extents) { + rc = seg_tree_remove(&meta->extents, trunc_off, ULONG_MAX); + } + if (rc) { + LOGERR("removal of write extents due to truncation failed"); + rc = UNIFYFS_FAILURE; + } else { + rc = UNIFYFS_SUCCESS; + } + return rc; } -/* read data from specified chunk id, chunk offset, and count into user buffer, - * count should fit within chunk starting from specified offset */ -static int unifyfs_logio_chunk_write( - int fid, /* local file id */ - long pos, /* write offset inside the file */ - unifyfs_filemeta_t* meta, /* pointer to file meta data */ - int chunk_id, /* logical chunk id to write to */ - off_t chunk_offset, /* logical offset within chunk to write to */ - const void* buf, /* buffer holding data to be written */ - size_t count) /* number of bytes to write */ -{ - /* get chunk meta data */ - unifyfs_chunkmeta_t* chunk_meta = filemeta_get_chunkmeta(meta, chunk_id); - - if (chunk_meta->location != CHUNK_LOCATION_MEMFS && - chunk_meta->location != CHUNK_LOCATION_SPILLOVER) { - /* unknown chunk type */ - LOGERR("unknown chunk type"); - return UNIFYFS_ERROR_IO; - } - /* determine location of chunk */ - off_t log_offset = 0; - if (chunk_meta->location == CHUNK_LOCATION_MEMFS) { - /* just need a memcpy to write data */ - char* chunk_buf = unifyfs_compute_chunk_buf( - meta, chunk_id, chunk_offset); - memcpy(chunk_buf, buf, count); - - log_offset = chunk_buf - unifyfs_chunks; - } else if (chunk_meta->location == CHUNK_LOCATION_SPILLOVER) { - /* spill over to a file, so write to file 
descriptor */ - //MAP_OR_FAIL(pwrite); - off_t spill_offset = unifyfs_compute_spill_offset(meta, chunk_id, chunk_offset); - ssize_t rc = __real_pwrite(unifyfs_spilloverblock, buf, count, spill_offset); - if (rc < 0) { - LOGERR("pwrite failed: errno=%d (%s)", errno, strerror(errno)); +/* + * Sync all the write extents for the target file(s) to the server. + * The target_fid identifies a specific file, or all files (-1). + * Clears the metadata index afterwards. + * + * Returns 0 on success, nonzero otherwise. + */ +int unifyfs_sync(int target_fid) +{ + int tmp_rc; + int ret = UNIFYFS_SUCCESS; + + /* if caller gave us a file id, sync that specific fid */ + if (target_fid >= 0) { + /* user named a specific file id, lookup its metadata */ + int fid = target_fid; + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); + if ((NULL == meta) || (meta->fid != fid)) { + /* bail out with an error if we fail to find it */ + LOGERR("missing filemeta for fid=%d", fid); + return UNIFYFS_FAILURE; } - log_offset = spill_offset + unifyfs_max_chunks * (1 << unifyfs_chunk_bits); - } + /* sync with server if we need to */ + if (meta->needs_sync) { + /* write contents from segment tree to index buffer */ + off_t max_log_off = unifyfs_rewrite_index_from_seg_tree(meta); - /* find the corresponding file attr entry and update attr*/ - unifyfs_file_attr_t tmp_meta_entry; - tmp_meta_entry.fid = fid; - unifyfs_file_attr_t* ptr_meta_entry - = (unifyfs_file_attr_t*)bsearch(&tmp_meta_entry, - unifyfs_fattrs.meta_entry, - *unifyfs_fattrs.ptr_num_entries, - sizeof(unifyfs_file_attr_t), - compare_fattr); - if (ptr_meta_entry != NULL) { - ptr_meta_entry->size = pos + count; - } + /* if there are no index entries, we've got nothing to sync */ + if (*unifyfs_indices.ptr_num_entries == 0) { + /* consider that we've sync'd successfully */ + meta->needs_sync = 0; + return UNIFYFS_SUCCESS; + } - /* define an new index entry for this write operation */ - unifyfs_index_t cur_idx; - cur_idx.fid = ptr_meta_entry->gfid; - cur_idx.file_pos = pos; - cur_idx.mem_pos = log_offset; - cur_idx.length = count; - - /* split the write requests larger than unifyfs_key_slice_range into - * the ones smaller than unifyfs_key_slice_range - * */ - index_set_t tmp_index_set; - memset(&tmp_index_set, 0, sizeof(tmp_index_set)); - unifyfs_split_index(&cur_idx, &tmp_index_set, - unifyfs_key_slice_range); - - /* lookup number of existing index entries */ - off_t num_entries = *(unifyfs_indices.ptr_num_entries); - - /* number of new entries we may add */ - off_t tmp_entries = (off_t) tmp_index_set.count; - - /* check whether there is room to add new entries */ - if (num_entries + tmp_entries < unifyfs_max_index_entries) { - /* get pointer to index array */ - unifyfs_index_t* idxs = unifyfs_indices.index_entry; - - /* coalesce contiguous indices */ - int i = 0; - if (num_entries > 0) { - /* pointer to last element in index array */ - unifyfs_index_t* prev_idx = &idxs[num_entries - 1]; - - /* pointer to first element in temp list */ - unifyfs_index_t* next_idx = &tmp_index_set.idxes[0]; - - /* offset of last byte for last index in list */ - off_t prev_offset = prev_idx->file_pos + prev_idx->length; - - /* check whether last index and temp index refer to - * contiguous bytes in the same file */ - if (prev_idx->fid == next_idx->fid && - prev_offset == next_idx->file_pos) { - /* got contiguous bytes in the same file, - * check if both index values fall in the same slice */ - off_t prev_slice = prev_idx->file_pos / unifyfs_key_slice_range; - off_t next_slice 
= next_idx->file_pos / unifyfs_key_slice_range; - if (prev_slice == next_slice) { - /* index values also are in same slice, - * so append first index in temp list to - * last index in list */ - prev_idx->length += next_idx->length; - - /* advance to next index in temp list */ - i++; + /* ensure any data written to the spillover file is flushed */ + off_t logio_shmem_size; + unifyfs_logio_get_sizes(logio_ctx, &logio_shmem_size, NULL); + if (max_log_off >= logio_shmem_size) { + /* some extents range into spill over area, + * so flush data to spill over file */ + tmp_rc = unifyfs_logio_sync(logio_ctx); + if (UNIFYFS_SUCCESS != tmp_rc) { + LOGERR("failed to sync logio data"); + ret = tmp_rc; } + LOGDBG("after logio spill sync"); } - } - /* pointer to temp index list */ - unifyfs_index_t* newidxs = tmp_index_set.idxes; + /* tell the server to grab our new extents */ + tmp_rc = invoke_client_sync_rpc(meta->gfid); + if (UNIFYFS_SUCCESS != tmp_rc) { + /* something went wrong when trying to flush extents */ + LOGERR("failed to flush write index to server for gfid=%d", + meta->gfid); + ret = tmp_rc; + } - /* copy remaining items in temp index list to index list */ - while (i < tmp_index_set.count) { - /* copy index fields */ - idxs[num_entries].fid = newidxs[i].fid; - idxs[num_entries].file_pos = newidxs[i].file_pos; - idxs[num_entries].mem_pos = newidxs[i].mem_pos; - idxs[num_entries].length = newidxs[i].length; + /* we've sync'd, so mark this file as being up-to-date */ + meta->needs_sync = 0; - /* advance to next element in each list */ - num_entries++; - i++; + /* flushed, clear buffer and refresh number of entries + * and number remaining */ + clear_index(); } - /* update number of entries in index array */ - (*unifyfs_indices.ptr_num_entries) = num_entries; - } else { - /* TODO: no room to write additional index metadata entries, - * swap out existing metadata buffer to disk*/ - printf("exhausted metadata"); + return ret; } - /* assume read was successful if we get to here */ - return UNIFYFS_SUCCESS; -} - -/* read data from specified chunk id, chunk offset, and count into user buffer, - * count should fit within chunk starting from specified offset */ -static int unifyfs_chunk_write( - unifyfs_filemeta_t* meta, /* pointer to file meta data */ - int chunk_id, /* logical chunk id to write to */ - off_t chunk_offset, /* logical offset within chunk to write to */ - const void* buf, /* buffer holding data to be written */ - size_t count) /* number of bytes to write */ -{ - /* get chunk meta data */ - unifyfs_chunkmeta_t* chunk_meta = filemeta_get_chunkmeta(meta, chunk_id); - - /* determine location of chunk */ - if (chunk_meta->location == CHUNK_LOCATION_MEMFS) { - /* just need a memcpy to write data */ - void* chunk_buf = unifyfs_compute_chunk_buf( - meta, chunk_id, chunk_offset); - memcpy(chunk_buf, buf, count); -// _intel_fast_memcpy(chunk_buf, buf, count); -// unifyfs_memcpy(chunk_buf, buf, count); - } else if (chunk_meta->location == CHUNK_LOCATION_SPILLOVER) { - /* spill over to a file, so write to file descriptor */ - //MAP_OR_FAIL(pwrite); - off_t spill_offset = unifyfs_compute_spill_offset(meta, chunk_id, chunk_offset); - ssize_t rc = pwrite(unifyfs_spilloverblock, buf, count, spill_offset); - if (rc < 0) { - LOGERR("pwrite failed: errno=%d (%s)", errno, strerror(errno)); + /* to get here, caller specified target_fid = -1, + * so sync every file descriptor */ + for (int i = 0; i < UNIFYFS_MAX_FILEDESCS; i++) { + /* get file id for each file descriptor */ + int fid = unifyfs_fds[i].fid; + 
if (-1 == fid) { + /* file descriptor is not currently in use */ + continue; } - /* TODO: check return code for errors */ - } else { - /* unknown chunk type */ - LOGERR("unknown chunk type"); - return UNIFYFS_ERROR_IO; + /* got an open file, sync this file id */ + tmp_rc = unifyfs_sync(fid); + if (UNIFYFS_SUCCESS != tmp_rc) { + ret = tmp_rc; + } } - /* assume read was successful if we get to here */ - return UNIFYFS_SUCCESS; + return ret; } /* --------------------------------------- * Operations on file storage * --------------------------------------- */ -/* if length is greater than reserved space, reserve space up to length */ -int unifyfs_fid_store_fixed_extend(int fid, unifyfs_filemeta_t* meta, - off_t length) -{ - /* determine whether we need to allocate more chunks */ - off_t maxsize = meta->chunks << unifyfs_chunk_bits; - if (length > maxsize) { - /* compute number of additional bytes we need */ - off_t additional = length - maxsize; - while (additional > 0) { - /* check that we don't overrun max number of chunks for file */ - if (meta->chunks == unifyfs_max_chunks + unifyfs_spillover_max_chunks) { - return UNIFYFS_ERROR_NOSPC; - } - - /* allocate a new chunk */ - int rc = unifyfs_chunk_alloc(fid, meta, meta->chunks); - if (rc != UNIFYFS_SUCCESS) { - LOGERR("failed to allocate chunk"); - return UNIFYFS_ERROR_NOSPC; - } - - /* increase chunk count and subtract bytes from the number we need */ - meta->chunks++; - additional -= unifyfs_chunk_size; - } - } - - return UNIFYFS_SUCCESS; -} - -/* if length is shorter than reserved space, give back space down to length */ -int unifyfs_fid_store_fixed_shrink(int fid, unifyfs_filemeta_t* meta, - off_t length) +/** + * Write data to file using log-based I/O + * + * @param fid file id to write to + * @param meta metadata for file + * @param pos file position to start writing at + * @param buf user buffer holding data + * @param count number of bytes to write + * @param nwritten number of bytes written + * @return UNIFYFS_SUCCESS, or error code + */ +int unifyfs_fid_logio_write(int fid, + unifyfs_filemeta_t* meta, + off_t pos, + const void* buf, + size_t count, + size_t* nwritten) { - /* determine the number of chunks to leave after truncating */ - off_t num_chunks = 0; - if (length > 0) { - num_chunks = (length >> unifyfs_chunk_bits) + 1; - } + /* assume we'll fail to write anything */ + *nwritten = 0; - /* clear off any extra chunks */ - while (meta->chunks > num_chunks) { - meta->chunks--; - unifyfs_chunk_free(fid, meta, meta->chunks); + assert(meta != NULL); + if (meta->storage != FILE_STORAGE_LOGIO) { + LOGERR("file (fid=%d) storage mode != FILE_STORAGE_LOGIO", fid); + return EINVAL; } - return UNIFYFS_SUCCESS; -} - -/* read data from file stored as fixed-size chunks */ -int unifyfs_fid_store_fixed_read(int fid, unifyfs_filemeta_t* meta, off_t pos, - void* buf, size_t count) -{ - int rc; - - /* get pointer to position within first chunk */ - int chunk_id = pos >> unifyfs_chunk_bits; - off_t chunk_offset = pos & unifyfs_chunk_mask; - - /* determine how many bytes remain in the current chunk */ - size_t remaining = unifyfs_chunk_size - chunk_offset; - if (count <= remaining) { - /* all bytes for this read fit within the current chunk */ - rc = unifyfs_chunk_read(meta, chunk_id, chunk_offset, buf, count); - } else { - /* read what's left of current chunk */ - char* ptr = (char*) buf; - rc = unifyfs_chunk_read(meta, chunk_id, - chunk_offset, (void*)ptr, remaining); - ptr += remaining; - - /* read from the next chunk */ - size_t processed = 
remaining; - while (processed < count && rc == UNIFYFS_SUCCESS) { - /* get pointer to start of next chunk */ - chunk_id++; - - /* compute size to read from this chunk */ - size_t num = count - processed; - if (num > unifyfs_chunk_size) { - num = unifyfs_chunk_size; - } - - /* read data */ - rc = unifyfs_chunk_read(meta, chunk_id, 0, (void*)ptr, num); - ptr += num; - - /* update number of bytes written */ - processed += num; - } + /* allocate space in the log for this write */ + off_t log_off; + int rc = unifyfs_logio_alloc(logio_ctx, count, &log_off); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("logio_alloc(%zu) failed", count); + return rc; } - return rc; -} - -/* write data to file stored as fixed-size chunks */ -int unifyfs_fid_store_fixed_write(int fid, unifyfs_filemeta_t* meta, off_t pos, - const void* buf, size_t count) -{ - int rc; - - /* get pointer to position within first chunk */ - int chunk_id; - off_t chunk_offset; - - if (meta->storage == FILE_STORAGE_FIXED_CHUNK) { - chunk_id = pos >> unifyfs_chunk_bits; - chunk_offset = pos & unifyfs_chunk_mask; - } else if (meta->storage == FILE_STORAGE_LOGIO) { - chunk_id = meta->size >> unifyfs_chunk_bits; - chunk_offset = meta->size & unifyfs_chunk_mask; - } else { - return UNIFYFS_ERROR_IO; + /* do the write */ + rc = unifyfs_logio_write(logio_ctx, log_off, count, buf, nwritten); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("logio_write(%zu, %zu) failed", log_off, count); + return rc; } - /* determine how many bytes remain in the current chunk */ - size_t remaining = unifyfs_chunk_size - chunk_offset; - if (count <= remaining) { - /* all bytes for this write fit within the current chunk */ - if (meta->storage == FILE_STORAGE_FIXED_CHUNK) { - rc = unifyfs_chunk_write(meta, chunk_id, chunk_offset, buf, count); - } else if (meta->storage == FILE_STORAGE_LOGIO) { - rc = unifyfs_logio_chunk_write(fid, pos, meta, chunk_id, chunk_offset, - buf, count); - } else { - return UNIFYFS_ERROR_IO; - } + if (*nwritten < count) { + LOGWARN("partial logio_write() @ offset=%zu (%zu of %zu bytes)", + (size_t)log_off, *nwritten, count); } else { - /* otherwise, fill up the remainder of the current chunk */ - char* ptr = (char*) buf; - if (meta->storage == FILE_STORAGE_FIXED_CHUNK) { - rc = unifyfs_chunk_write(meta, chunk_id, - chunk_offset, (void*)ptr, remaining); - } else if (meta->storage == FILE_STORAGE_LOGIO) { - rc = unifyfs_logio_chunk_write(fid, pos, meta, chunk_id, - chunk_offset, (void*)ptr, remaining); - } else { - return UNIFYFS_ERROR_IO; - } - - ptr += remaining; - pos += remaining; - - /* then write the rest of the bytes starting from beginning - * of chunks */ - size_t processed = remaining; - while (processed < count && rc == UNIFYFS_SUCCESS) { - /* get pointer to start of next chunk */ - chunk_id++; - - /* compute size to write to this chunk */ - size_t num = count - processed; - if (num > unifyfs_chunk_size) { - num = unifyfs_chunk_size; - } - - /* write data */ - if (meta->storage == FILE_STORAGE_FIXED_CHUNK) { - rc = unifyfs_chunk_write(meta, chunk_id, 0, (void*)ptr, num); - } else if (meta->storage == FILE_STORAGE_LOGIO) { - rc = unifyfs_logio_chunk_write(fid, pos, meta, chunk_id, 0, - (void*)ptr, num); - } else { - return UNIFYFS_ERROR_IO; - } - ptr += num; - pos += num; - - /* update number of bytes processed */ - processed += num; - } + LOGDBG("fid=%d pos=%zu - successful logio_write() " + "@ log offset=%zu (%zu bytes)", + fid, (size_t)pos, (size_t)log_off, count); } + /* update our write metadata for this write */ + rc = 
add_write_meta_to_index(meta, pos, log_off, *nwritten); return rc; } diff --git a/client/src/unifyfs-fixed.h b/client/src/unifyfs-fixed.h index dbd46b4e9..8053e0e7c 100644 --- a/client/src/unifyfs-fixed.h +++ b/client/src/unifyfs-fixed.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -45,40 +45,23 @@ #include "unifyfs-internal.h" -/* if length is greater than reserved space, - * reserve space up to length */ -int unifyfs_fid_store_fixed_extend( - int fid, /* file id to reserve space for */ - unifyfs_filemeta_t* meta, /* meta data for file */ - off_t length /* number of bytes to reserve for file */ -); +/* rewrite client's shared memory index of file write extents */ +off_t unifyfs_rewrite_index_from_seg_tree(unifyfs_filemeta_t* meta); -/* if length is shorter than reserved space, - * give back space down to length */ -int unifyfs_fid_store_fixed_shrink( - int fid, /* file id to free space for */ - unifyfs_filemeta_t* meta, /* meta data for file */ - off_t length /* number of bytes to reserve for file */ -); +/* remove/truncate write extents in client metadata */ +int truncate_write_meta(unifyfs_filemeta_t* meta, off_t trunc_sz); -/* read data from file stored as fixed-size chunks, - * returns UNIFYFS error code */ -int unifyfs_fid_store_fixed_read( - int fid, /* file id to read from */ - unifyfs_filemeta_t* meta, /* meta data for file */ - off_t pos, /* position within file to read from */ - void* buf, /* user buffer to store data in */ - size_t count /* number of bytes to read */ -); +/* sync all writes for target file(s) with the server */ +int unifyfs_sync(int target_fid); -/* write data to file stored as fixed-size chunks, - * returns UNIFYFS error code */ -int unifyfs_fid_store_fixed_write( - int fid, /* file id to write to */ +/* write data to file using log-based I/O */ +int unifyfs_fid_logio_write( + int fid, /* file id to write to */ unifyfs_filemeta_t* meta, /* meta data for file */ - off_t pos, /* position within file to write to */ - const void* buf, /* user buffer holding data */ - size_t count /* number of bytes to write */ + off_t pos, /* file position to start writing at */ + const void* buf, /* user buffer holding data */ + size_t count, /* number of bytes to write */ + size_t* nwritten /* returns number of bytes written */ ); #endif /* UNIFYFS_FIXED_H */ diff --git a/client/src/unifyfs-internal.h b/client/src/unifyfs-internal.h index f31504f18..6329780cc 100644 --- a/client/src/unifyfs-internal.h +++ b/client/src/unifyfs-internal.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. 
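/*
 * For illustration only: unifyfs_fid_logio_write() above follows a
 * log-structured pattern -- reserve space in a write log, copy the data
 * there, then record an extent that maps the file offset to the log
 * offset. The sketch below is a minimal, self-contained model of that
 * idea in plain C (fixed-size arrays instead of the real logio/seg_tree
 * machinery); names such as toy_log_write() are invented for the example
 * and are not part of the UnifyFS API.
 */
#include <stdio.h>
#include <string.h>

#define TOY_LOG_SIZE    4096
#define TOY_MAX_EXTENTS 64

/* one record in the write index: file range -> location in the log */
struct toy_extent {
    size_t file_off;  /* logical offset within the file */
    size_t log_off;   /* where the bytes live in the log */
    size_t length;    /* number of bytes */
};

static char toy_log[TOY_LOG_SIZE];          /* the write log */
static size_t toy_log_used;                 /* bytes already reserved */
static struct toy_extent toy_index[TOY_MAX_EXTENTS];
static int toy_extent_count;

/* append one write to the log and remember where it went */
static int toy_log_write(size_t file_off, const void* buf, size_t count)
{
    if (toy_log_used + count > TOY_LOG_SIZE ||
        toy_extent_count == TOY_MAX_EXTENTS) {
        return -1;  /* out of log space or index slots */
    }

    /* "allocate" space in the log, then copy the data there */
    size_t log_off = toy_log_used;
    toy_log_used += count;
    memcpy(&toy_log[log_off], buf, count);

    /* record the extent so a later read or sync can find the data */
    toy_index[toy_extent_count++] =
        (struct toy_extent){ file_off, log_off, count };
    return 0;
}

int main(void)
{
    /* two writes at different file offsets land back-to-back in the log */
    toy_log_write(0,   "hello ", 6);
    toy_log_write(100, "world",  5);

    for (int i = 0; i < toy_extent_count; i++) {
        printf("file [%zu, %zu) -> log offset %zu\n",
               toy_index[i].file_off,
               toy_index[i].file_off + toy_index[i].length,
               toy_index[i].log_off);
    }
    return 0;
}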
@@ -57,6 +57,10 @@ * ------------------------------- */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + // system headers #include #include @@ -85,17 +89,22 @@ #include #include -#define _GNU_SOURCE #include #include +#ifdef HAVE_SYS_STATFS_H +#include +#endif + // common headers #include "unifyfs_configurator.h" #include "unifyfs_const.h" #include "unifyfs_keyval.h" #include "unifyfs_log.h" +#include "unifyfs_logio.h" #include "unifyfs_meta.h" #include "unifyfs_shm.h" +#include "seg_tree.h" // client headers #include "unifyfs.h" @@ -114,26 +123,32 @@ unifyfs_unsupported(__func__, __FILE__, __LINE__, fmt, ##args) #ifdef UNIFYFS_GOTCHA +#include -/* gotcha fills in address of original/real function - * and we need to declare function prototype for each - * wrapper */ -#define UNIFYFS_DECL(name,ret,args) \ - extern ret(*__real_ ## name)args; \ - ret __wrap_ ## name args; - -/* define each DECL function in a .c file */ -#define UNIFYFS_DEF(name,ret,args) \ - ret(*__real_ ## name)args = NULL; - -/* we define our wrapper function as __wrap_ instead of */ +/* the name of our wrapper - we use __wrap_ instead of */ #define UNIFYFS_WRAP(name) __wrap_ ## name -/* gotcha maps the call to __real_() */ +/* the name of the real function pointer */ #define UNIFYFS_REAL(name) __real_ ## name -/* no need to look up the address of the real function (gotcha does that) */ -#define MAP_OR_FAIL(func) +/* declare anything that will be used externally */ +#define UNIFYFS_DECL(name, ret, args) \ + extern gotcha_wrappee_handle_t wrappee_handle_ ## name; \ + extern ret (*__real_ ## name) args; \ + ret __wrap_ ## name args + +/* ask gotcha for the address of the real function */ +#define MAP_OR_FAIL(name) \ +do { \ + if (NULL == __real_ ## name) { \ + __real_ ## name = gotcha_get_wrappee(wrappee_handle_ ## name); \ + if (NULL == __real_ ## name) { \ + assert(!"missing Gotcha wrappee for " #name); \ + } \ + } \ +} while (0) + +int setup_gotcha_wrappers(void); #elif UNIFYFS_PRELOAD @@ -145,7 +160,6 @@ * dlsym */ /* we need the dlsym function */ -#define __USE_GNU #include /* define a static variable called __real_open to record address of @@ -204,6 +218,7 @@ typedef struct { off_t pos; /* current file pointer */ int read; /* whether file is opened for read */ int write; /* whether file is opened for write */ + int append; /* whether file is opened for append */ } unifyfs_fd_t; enum unifyfs_stream_orientation { @@ -251,37 +266,27 @@ enum flock_enum { SH_LOCKED }; -/* TODO: make this an enum */ -#define FILE_STORAGE_NULL 0 -#define FILE_STORAGE_FIXED_CHUNK 1 -#define FILE_STORAGE_LOGIO 2 +enum {FILE_STORAGE_NULL = 0, FILE_STORAGE_LOGIO}; -/* TODO: make this an enum */ -#define CHUNK_LOCATION_NULL 0 -#define CHUNK_LOCATION_MEMFS 1 -#define CHUNK_LOCATION_SPILLOVER 2 typedef struct { - int location; /* CHUNK_LOCATION type */ - off_t id; /* physical id of chunk in its respective storage */ -} unifyfs_chunkmeta_t; - -typedef struct { - off_t size; /* current file size */ - off_t log_size; /* real size of the file for logio*/ - pthread_spinlock_t fspinlock; /* file lock variable */ - enum flock_enum flock_status; /* file lock status */ - - int storage; /* FILE_STORAGE type */ - - int needs_sync; /* have unsynced writes */ - - off_t chunks; /* number of chunks allocated to file */ - off_t chunkmeta_idx; /* starting index in unifyfs_chunkmeta */ - int is_laminated; /* Is this file laminated */ - uint32_t mode; /* st_mode bits. This has file - * permission info and will tell you if this - * is a regular file or directory. 
*/ + off_t global_size; /* Global size of the file */ + pthread_spinlock_t fspinlock; /* file lock variable */ + enum flock_enum flock_status; /* file lock status */ + + int storage; /* FILE_STORAGE type */ + + int fid; /* local file index in filemetas array */ + int gfid; /* global file id for this file */ + int needs_sync; /* have unsynced writes */ + + int is_laminated; /* Is this file laminated */ + uint32_t mode; /* st_mode bits. This has file + * permission info and will tell you if this + * is a regular file or directory. */ + struct seg_tree extents_sync; /* Segment tree containing our coalesced + * writes between sync operations */ + struct seg_tree extents; /* Segment tree of all local data extents */ } unifyfs_filemeta_t; /* struct used to map a full path to its local file id, @@ -296,12 +301,24 @@ typedef struct { } unifyfs_filename_t; /*unifyfs structures*/ + +/* This structure defines a client read request for a file. + * It is initialized by the client describing the global file id, + * offset, and length to be read and provides a pointer to + * the user buffer where the data should be placed. The + * server sets the errcode field to UNIFYFS_SUCCESS if the read + * succeeds and otherwise records an error code pertaining to + * why the read failed. The server records the number of bytes + * read in the nread field, which the client can use to detect + * short read operations. */ typedef struct { - int fid; - int errcode; - size_t offset; - size_t length; - char* buf; + int gfid; /* global file id to be read */ + int errcode; /* error code for read operation if any */ + size_t offset; /* logical offset in file to read from */ + size_t length; /* number of bytes to read */ + size_t nread; /* number of bytes actually read */ + char* buf; /* pointer to user buffer to place data */ + struct aiocb* aiocbp; /* the original request from application */ } read_req_t; typedef struct { @@ -309,37 +326,17 @@ typedef struct { unifyfs_index_t* index_entry; } unifyfs_index_buf_t; -typedef struct { - size_t* ptr_num_entries; - unifyfs_file_attr_t* meta_entry; -} unifyfs_fattr_buf_t; - -typedef struct { - unifyfs_index_t idxes[UNIFYFS_MAX_SPLIT_CNT]; - int count; -} index_set_t; - -typedef struct { - read_req_t read_reqs[UNIFYFS_MAX_READ_CNT]; - int count; -} read_req_set_t; - extern unifyfs_index_buf_t unifyfs_indices; extern unsigned long unifyfs_max_index_entries; -extern long unifyfs_spillover_max_chunks; -extern int local_rank_cnt; -extern int local_rank_idx; -extern int local_del_cnt; -extern int client_sockfd; -extern struct pollfd cmd_fd; -extern void* shm_req_buf; -extern void* shm_recv_buf; -extern char cmd_buf[CMD_BUF_SIZE]; -extern unifyfs_fattr_buf_t unifyfs_fattrs; +/* shmem context for read-request replies data region */ +extern shm_context* shm_recv_ctx; -extern int app_id; -extern size_t unifyfs_key_slice_range; +/* log-based I/O context */ +extern logio_context* logio_ctx; + +extern int unifyfs_app_id; +extern int unifyfs_client_id; /* ------------------------------- * Global varaible declarations @@ -366,6 +363,9 @@ extern unifyfs_filename_t* unifyfs_filelist; extern char* unifyfs_mount_prefix; extern size_t unifyfs_mount_prefixlen; +/* tracks current working directory within unifyfs directory namespace */ +extern char* unifyfs_cwd; + /* array of file descriptors */ extern unifyfs_fd_t unifyfs_fds[UNIFYFS_MAX_FILEDESCS]; extern rlim_t unifyfs_fd_limit; @@ -388,24 +388,11 @@ extern void* unifyfs_stream_stack; * each is an index into unifyfs_dirstreams array */ extern void* 
unifyfs_dirstream_stack; -extern int unifyfs_use_memfs; -extern int unifyfs_use_spillover; +/* mutex to lock stack operations */ +extern pthread_mutex_t unifyfs_stack_mutex; extern int unifyfs_max_files; /* maximum number of files to store */ -extern size_t -unifyfs_chunk_mem; /* number of bytes in memory to be used for chunk storage */ -extern int unifyfs_chunk_bits; /* we set chunk size = 2^unifyfs_chunk_bits */ -extern off_t unifyfs_chunk_size; /* chunk size in bytes */ -extern off_t -unifyfs_chunk_mask; /* mask applied to logical offset to determine physical offset within chunk */ -extern long -unifyfs_max_chunks; /* maximum number of chunks that fit in memory */ - -extern void* free_chunk_stack; -extern void* free_spillchunk_stack; -extern char* unifyfs_chunks; -extern unifyfs_chunkmeta_t* unifyfs_chunkmetas; -int unifyfs_spilloverblock; +extern bool unifyfs_local_extents; /* enable tracking of local extents */ /* ------------------------------- * Common functions @@ -423,16 +410,14 @@ int unifyfs_would_overflow_offt(off_t a, off_t b); * added together */ int unifyfs_would_overflow_long(long a, long b); -/* given an input mode, mask it with umask and return, can specify - * an input mode==0 to specify all read/write bits */ -mode_t unifyfs_getmode(mode_t perms); - -int unifyfs_stack_lock(); +int unifyfs_stack_lock(void); -int unifyfs_stack_unlock(); +int unifyfs_stack_unlock(void); -/* sets flag if the path is a special path */ -int unifyfs_intercept_path(const char* path); +/* sets flag if the path should be intercept as a unifyfs path, + * and if so, writes normalized path in upath, which should + * be a buffer of size UNIFYFS_MAX_FILENAME */ +int unifyfs_intercept_path(const char* path, char* upath); /* given an fd, return 1 if we should intercept this file, 0 otherwise, * convert fd to new fd value if needed */ @@ -470,14 +455,21 @@ unifyfs_fd_t* unifyfs_get_filedesc_from_fd(int fd); * otherwise return NULL */ unifyfs_filemeta_t* unifyfs_get_meta_from_fid(int fid); +/* Return 1 if fid is laminated, 0 if not */ +int unifyfs_fid_is_laminated(int fid); + +/* Return 1 if fd is laminated, 0 if not */ +int unifyfs_fd_is_laminated(int fd); + /* Given a fid, return the path. */ const char* unifyfs_path_from_fid(int fid); -/* given an UNIFYFS error code, return corresponding errno code */ -int unifyfs_err_map_to_errno(int rc); +/* Given a fid, return a gfid */ +int unifyfs_gfid_from_fid(const int fid); -/* given an errno error code, return corresponding UnifyFS error code */ -int unifyfs_errno_map_to_err(int rc); +/* returns fid for corresponding gfid, if one is active, + * returns -1 otherwise */ +int unifyfs_fid_from_gfid(const int gfid); /* checks to see if fid is a directory * returns 1 for yes @@ -492,18 +484,26 @@ int unifyfs_fid_is_dir(int fid); * returns 0 for no */ int unifyfs_fid_is_dir_empty(const char* path); -/* return current size of given file id */ -off_t unifyfs_fid_size(int fid); +/* Return current global size of given file id */ +off_t unifyfs_fid_global_size(int fid); -/* fill in limited amount of stat information for global file id */ -int unifyfs_gfid_stat(int gfid, struct stat* buf); +/* if we have a local fid structure corresponding to the gfid + * in question, we attempt the file lookup with the fid method + * otherwise call back to the rpc */ +off_t unifyfs_gfid_filesize(int gfid); -/* fill in limited amount of stat information */ -int unifyfs_fid_stat(int fid, struct stat* buf); +/* + * Return current size of given file id. 
If the file is laminated, return the + * global size. Otherwise, return the local size. + */ +off_t unifyfs_fid_logical_size(int fid); + +/* Update local metadata for file from global metadata */ +int unifyfs_fid_update_file_meta(int fid, unifyfs_file_attr_t* gfattr); /* allocate a file id slot for a new file * return the fid or -1 on error */ -int unifyfs_fid_alloc(); +int unifyfs_fid_alloc(void); /* return the file id back to the free pool */ int unifyfs_fid_free(int fid); @@ -516,30 +516,23 @@ int unifyfs_fid_create_file(const char* path); * returns the new fid, or a negative value on error */ int unifyfs_fid_create_directory(const char* path); -/* read count bytes from file starting from pos and store into buf, - * all bytes are assumed to exist, so checks on file size should be - * done before calling this routine */ -int unifyfs_fid_read(int fid, off_t pos, void* buf, size_t count); - -/* write count bytes from buf into file starting at offset pos, - * all bytes are assumed to be allocated to file, so file should - * be extended before calling this routine */ -int unifyfs_fid_write(int fid, off_t pos, const void* buf, size_t count); - -/* given a file id, write zero bytes to region of specified offset - * and length, assumes space is already reserved */ -int unifyfs_fid_write_zero(int fid, off_t pos, off_t count); - -/* increase size of file if length is greater than current size, - * and allocate additional chunks as needed to reserve space for - * length bytes */ -int unifyfs_fid_extend(int fid, off_t length); +/* write count bytes from buf into file starting at offset pos */ +int unifyfs_fid_write( + int fid, /* local file id to write to */ + off_t pos, /* starting offset within file */ + const void* buf, /* buffer of data to be written */ + size_t count, /* number of bytes to write */ + size_t* nwritten /* returns number of bytes written */ +); /* truncate file id to given length, frees resources if length is * less than size and allocates and zero-fills new bytes if length * is more than size */ int unifyfs_fid_truncate(int fid, off_t length); +/* sync data for file id to server if needed */ +int unifyfs_fid_sync(int fid); + /* opens a new file id with specified path, access flags, and permissions, * fills outfid with file id and outpos with position for current file pointer, * returns UNIFYFS error code */ @@ -554,11 +547,17 @@ int unifyfs_fid_unlink(int fid); /* functions used in UnifyFS */ -int unifyfs_generate_gfid(const char* path); +/* issue a set of read requests */ +int unifyfs_gfid_read_reqs(read_req_t* in_reqs, int in_count); -int unifyfs_set_global_file_meta(int fid, int gfid); +int unifyfs_set_global_file_meta_from_fid(int fid, + unifyfs_file_attr_op_e op); + +int unifyfs_set_global_file_meta(int gfid, + unifyfs_file_attr_op_e op, + unifyfs_file_attr_t* gfattr); -int unifyfs_get_global_file_meta(int fid, int gfid, +int unifyfs_get_global_file_meta(int gfid, unifyfs_file_attr_t* gfattr); // These require types/structures defined above diff --git a/client/src/unifyfs-stdio.c b/client/src/unifyfs-stdio.c index 0c3787023..7bc1fea79 100644 --- a/client/src/unifyfs-stdio.c +++ b/client/src/unifyfs-stdio.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. 
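/*
 * For illustration only: the read_req_t structure above describes one
 * client read request (gfid, offset, length, user buffer), and the server
 * fills in errcode and nread so the client can detect failures and short
 * reads. The self-contained sketch below mimics that request/response
 * contract with a toy in-memory "file"; toy_read_req and toy_serve_reads()
 * are invented for the example and are not UnifyFS code.
 */
#include <stdio.h>
#include <string.h>

struct toy_read_req {
    size_t offset;   /* where to read from */
    size_t length;   /* how many bytes were requested */
    size_t nread;    /* how many bytes were actually delivered */
    int    errcode;  /* 0 on success, nonzero otherwise */
    char*  buf;      /* caller-supplied destination buffer */
};

/* pretend server: satisfy each request from one in-memory file */
static void toy_serve_reads(const char* file, size_t filesize,
                            struct toy_read_req* reqs, int count)
{
    for (int i = 0; i < count; i++) {
        struct toy_read_req* r = &reqs[i];
        if (r->offset >= filesize) {
            r->nread = 0;     /* nothing available at that offset */
            r->errcode = 0;
            continue;
        }
        size_t avail = filesize - r->offset;
        r->nread = (r->length < avail) ? r->length : avail;
        memcpy(r->buf, file + r->offset, r->nread);
        r->errcode = 0;
    }
}

int main(void)
{
    const char data[] = "log-structured reads";
    char buf1[8] = {0};
    char buf2[8] = {0};

    struct toy_read_req reqs[2] = {
        { .offset = 0,  .length = 7, .buf = buf1 },
        { .offset = 16, .length = 7, .buf = buf2 },  /* runs past EOF */
    };

    toy_serve_reads(data, sizeof(data) - 1, reqs, 2);

    for (int i = 0; i < 2; i++) {
        if (reqs[i].errcode != 0) {
            printf("request %d failed (%d)\n", i, reqs[i].errcode);
        } else if (reqs[i].nread < reqs[i].length) {
            printf("request %d short read: %zu of %zu bytes\n",
                   i, reqs[i].nread, reqs[i].length);
        } else {
            printf("request %d ok: %.*s\n",
                   i, (int)reqs[i].nread, reqs[i].buf);
        }
    }
    return 0;
}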
@@ -152,7 +152,7 @@ int unifyfs_stream_set_pointers(unifyfs_stream_t* s) /* ERROR: invalid file descriptor */ s->err = 1; errno = EBADF; - return UNIFYFS_ERROR_BADF; + return EBADF; } /* if we have anything on the push back buffer, that must be @@ -189,7 +189,7 @@ int unifyfs_stream_set_pointers(unifyfs_stream_t* s) /* given a mode like "r", "wb+", or "a+" return flags read, write, * append, and plus to indicate which were set, - * returns UNIFYFS_ERROR_INVAL if invalid character is found + * returns EINVAL if invalid character is found */ static int unifyfs_fopen_parse_mode( const char* mode, @@ -206,13 +206,13 @@ static int unifyfs_fopen_parse_mode( /* ensure that user specified an input mode */ if (mode == NULL) { - return UNIFYFS_ERROR_INVAL; + return EINVAL; } /* get number of characters in mode */ size_t len = strlen(mode); if (len <= 0 || len > 3) { - return UNIFYFS_ERROR_INVAL; + return EINVAL; } /* first character must either be r, w, or a */ @@ -228,7 +228,7 @@ static int unifyfs_fopen_parse_mode( *append = 1; break; default: - return UNIFYFS_ERROR_INVAL; + return EINVAL; } /* optional second character may either be + or b */ @@ -243,7 +243,7 @@ static int unifyfs_fopen_parse_mode( char third = mode[2]; if (third != 'b') { /* third character something other than + or b */ - return UNIFYFS_ERROR_INVAL; + return EINVAL; } } } else if (second == 'b') { @@ -254,12 +254,12 @@ static int unifyfs_fopen_parse_mode( *plus = 1; } else { /* third character something other than + or b */ - return UNIFYFS_ERROR_INVAL; + return EINVAL; } } } else { /* second character something other than + or b */ - return UNIFYFS_ERROR_INVAL; + return EINVAL; } } @@ -295,7 +295,7 @@ static int unifyfs_fopen( off_t pos; if (read) { /* read shall fail if file does not already exist, unifyfs_fid_open - * returns UNIFYFS_ERROR_NOENT if file does not exist w/o O_CREAT + * returns ENOENT if file does not exist w/o O_CREAT */ if (plus) { /* r+ ==> open file for update (reading and writing) */ @@ -342,7 +342,7 @@ static int unifyfs_fopen( * process has hit file stream limit, not the OS */ /* exhausted our file streams */ - return UNIFYFS_ERROR_NFILE; + return ENFILE; } /* get stream structure corresponding to stream id */ @@ -358,7 +358,7 @@ static int unifyfs_fopen( unifyfs_stack_push(unifyfs_stream_stack, sid); /* exhausted our file descriptors */ - return UNIFYFS_ERROR_NFILE; + return ENFILE; } /* set file pointer and read/write mode in file descriptor */ @@ -366,7 +366,7 @@ static int unifyfs_fopen( filedesc->fid = fid; filedesc->pos = pos; filedesc->read = read || plus; - filedesc->write = write || plus; + filedesc->write = write || plus || append; /* record our stream id value */ s->sid = sid; @@ -422,19 +422,19 @@ static int unifyfs_setvbuf( /* check whether we've already associated a buffer */ if (s->buf != NULL) { /* ERROR: stream already has buffer */ - return UNIFYFS_ERROR_BADF; + return EBADF; } /* check that the type argument is valid */ if (type != _IOFBF && type != _IOLBF && type != _IONBF) { /* ERROR: invalid type argument */ - return UNIFYFS_ERROR_INVAL; + return EINVAL; } /* check that size is valid */ if (size <= 0) { /* ERROR: invalid size argument */ - return UNIFYFS_ERROR_INVAL; + return EINVAL; } /* associate buffer with stream */ @@ -443,7 +443,7 @@ static int unifyfs_setvbuf( s->buf = malloc(size); if (s->buf == NULL) { /* ERROR: no memory */ - return UNIFYFS_ERROR_NOMEM; + return ENOMEM; } /* remember that we need to free the buffer at the end */ s->buffree = 1; @@ -476,13 +476,39 @@ static 
int unifyfs_stream_flush(FILE* stream) /* if buffer is dirty, write data to file */ if (s->buf != NULL && s->bufdirty) { - int write_rc = unifyfs_fd_write(s->fd, s->bufpos, s->buf, s->buflen); + size_t nwritten = 0; + int write_rc = unifyfs_fd_write(s->fd, s->bufpos, s->buf, s->buflen, + &nwritten); if (write_rc != UNIFYFS_SUCCESS) { + /* ERROR: set stream error indicator and errno */ s->err = 1; - errno = unifyfs_err_map_to_errno(write_rc); + errno = unifyfs_rc_errno(write_rc); return write_rc; } + /* TODO: treat short writes as error? */ + + /* note there is no need to update the file descriptor position here + * since we wrote to a specific offset independent of the file + * descriptor position */ + + /* lookup file id from file descriptor attached to stream */ + int fid = unifyfs_get_fid_from_fd(s->fd); + if (fid < 0) { + s->err = 1; + errno = EBADF; + return EBADF; + } + + /* invoke fsync rpc to register index metadata with server */ + int ret = unifyfs_fid_sync(fid); + if (ret != UNIFYFS_SUCCESS) { + /* sync failed for some reason, set errno and return error */ + s->err = 1; + errno = unifyfs_rc_errno(ret); + return ret; + } + /* indicate that buffer is now flushed */ s->bufdirty = 0; } @@ -493,7 +519,7 @@ static int unifyfs_stream_flush(FILE* stream) /* reads count bytes from stream into buf, sets stream EOF and error * indicators as appropriate, sets errno if error, updates file * position, returns number of bytes read in retcount, returns UNIFYFS - * error codes*/ + * error codes */ static int unifyfs_stream_read( FILE* stream, void* buf, @@ -513,14 +539,17 @@ static int unifyfs_stream_read( /* ERROR: invalid file descriptor */ s->err = 1; errno = EBADF; - return UNIFYFS_ERROR_BADF; + LOGDBG("Invalid file descriptor"); + return EBADF; } /* bail with error if stream not open for reading */ if (!filedesc->read) { s->err = 1; errno = EBADF; - return UNIFYFS_ERROR_BADF; + LOGDBG("Stream not open for reading"); + + return EBADF; } /* associate buffer with stream if we need to */ @@ -530,13 +559,15 @@ static int unifyfs_stream_read( if (setvbuf_rc != UNIFYFS_SUCCESS) { /* ERROR: failed to associate buffer */ s->err = 1; - errno = unifyfs_err_map_to_errno(setvbuf_rc); + errno = unifyfs_rc_errno(setvbuf_rc); + LOGDBG("Couldn't setvbuf"); return setvbuf_rc; } } /* don't attempt read if end-of-file indicator is set */ if (s->eof) { + LOGDBG("Stop read, at EOF"); return UNIFYFS_FAILURE; } @@ -549,7 +580,7 @@ static int unifyfs_stream_read( if (unifyfs_would_overflow_offt(current, (off_t) count)) { s->err = 1; errno = EOVERFLOW; - return UNIFYFS_ERROR_OVERFLOW; + return EOVERFLOW; } /* take bytes from push back buffer if they exist */ @@ -593,21 +624,22 @@ static int unifyfs_stream_read( } /* read data from file into buffer */ - size_t bufcount; - int read_rc = unifyfs_fd_read(s->fd, current, s->buf, s->bufsize, &bufcount); + size_t nread = 0; + int read_rc = unifyfs_fd_read(s->fd, current, s->buf, + s->bufsize, &nread); if (read_rc != UNIFYFS_SUCCESS) { - /* ERROR: read error, set error indicator and errno */ + /* ERROR: set error indicator and errno */ s->err = 1; - errno = unifyfs_err_map_to_errno(read_rc); - return read_rc; + errno = unifyfs_rc_errno(read_rc); + return EIO; } /* record new buffer range within file */ - s->bufpos = current; - s->buflen = bufcount; + s->bufpos = current; + s->buflen = nread; /* set end-of-file flag if our read was short */ - if (bufcount < s->bufsize) { + if (s->buflen < s->bufsize) { eof = 1; } } @@ -637,7 +669,7 @@ static int unifyfs_stream_read( 
*retcount = (count - remaining); /* update file position */ - filedesc->pos += (off_t) * retcount; + filedesc->pos += (off_t) *retcount; /* set end of file indicator if we hit the end */ if (*retcount < count) { @@ -669,16 +701,18 @@ static int unifyfs_stream_write( unifyfs_fd_t* filedesc = unifyfs_get_filedesc_from_fd(s->fd); if (filedesc == NULL) { /* ERROR: invalid file descriptor */ + LOGDBG("Bad file descriptor"); s->err = 1; errno = EBADF; - return UNIFYFS_ERROR_BADF; + return EBADF; } /* bail with error if stream not open for writing */ if (!filedesc->write) { + LOGDBG("Stream not open for writing"); s->err = 1; errno = EBADF; - return UNIFYFS_ERROR_BADF; + return EBADF; } /* TODO: Don't know what to do with push back bytes if write @@ -692,9 +726,9 @@ static int unifyfs_stream_write( if (fid < 0) { s->err = 1; errno = EBADF; - return UNIFYFS_ERROR_BADF; + return EBADF; } - current = unifyfs_fid_size(fid); + current = unifyfs_fid_logical_size(fid); /* like a seek, we discard push back bytes */ s->ubuflen = 0; @@ -716,7 +750,7 @@ static int unifyfs_stream_write( if (unifyfs_would_overflow_offt(current, (off_t) count)) { s->err = 1; errno = EFBIG; - return UNIFYFS_ERROR_FBIG; + return EFBIG; } /* associate buffer with stream if we need to */ @@ -726,7 +760,7 @@ static int unifyfs_stream_write( if (setvbuf_rc != UNIFYFS_SUCCESS) { /* ERROR: failed to associate buffer */ s->err = 1; - errno = unifyfs_err_map_to_errno(setvbuf_rc); + errno = unifyfs_rc_errno(setvbuf_rc); return setvbuf_rc; } } @@ -734,16 +768,19 @@ static int unifyfs_stream_write( /* if unbuffered, write data directly to file */ if (s->buftype == _IONBF) { /* write data directly to file */ - int write_rc = unifyfs_fd_write(s->fd, current, buf, count); + size_t nwritten = 0; + int write_rc = unifyfs_fd_write(s->fd, current, buf, count, &nwritten); if (write_rc != UNIFYFS_SUCCESS) { - /* ERROR: write error, set error indicator and errno */ + /* ERROR: set stream error indicator and errno */ s->err = 1; - errno = unifyfs_err_map_to_errno(write_rc); + errno = unifyfs_rc_errno(write_rc); return write_rc; } + /* TODO: treat short writes as error? 
*/ + /* update file position */ - filedesc->pos = current + (off_t) count; + filedesc->pos = current + (off_t) nwritten; return UNIFYFS_SUCCESS; } @@ -788,7 +825,7 @@ static int unifyfs_stream_write( /* ERROR: write error, set error indicator and errno */ s->err = 1; errno = ENOMEM; - return UNIFYFS_ERROR_NOMEM; + return ENOMEM; } } else { /* fully buffered, write until we hit the buffer limit */ @@ -882,6 +919,11 @@ static int unifyfs_fseek(FILE* stream, off_t offset, int whence) switch (whence) { case SEEK_SET: /* seek to offset */ + if (offset < 0) { + /* negative offset is invalid */ + errno = EINVAL; + return -1; + } current_pos = offset; break; case SEEK_CUR: @@ -891,16 +933,26 @@ static int unifyfs_fseek(FILE* stream, off_t offset, int whence) errno = EOVERFLOW; return -1; } + if (current_pos + offset < 0) { + /* offset is negative and will result in a negative position */ + errno = EINVAL; + return -1; + } current_pos += offset; break; case SEEK_END: /* seek to EOF + offset */ - filesize = unifyfs_fid_size(fid); + filesize = unifyfs_fid_logical_size(fid); if (unifyfs_would_overflow_offt(filesize, offset)) { s->err = 1; errno = EOVERFLOW; return -1; } + if (filesize + offset < 0) { + /* offset is negative and will result in negative position */ + errno = EINVAL; + return -1; + } current_pos = filesize + offset; break; default: @@ -928,11 +980,12 @@ static int unifyfs_fseek(FILE* stream, off_t offset, int whence) FILE* UNIFYFS_WRAP(fopen)(const char* path, const char* mode) { /* check whether we should intercept this path */ - if (unifyfs_intercept_path(path)) { + char upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(path, upath)) { FILE* stream; - int rc = unifyfs_fopen(path, mode, &stream); + int rc = unifyfs_fopen(upath, mode, &stream); if (rc != UNIFYFS_SUCCESS) { - errno = unifyfs_err_map_to_errno(rc); + errno = unifyfs_rc_errno(rc); return NULL; } return stream; @@ -969,7 +1022,7 @@ int UNIFYFS_WRAP(setvbuf)(FILE* stream, char* buf, int type, size_t size) if (unifyfs_intercept_stream(stream)) { int rc = unifyfs_setvbuf(stream, buf, type, size); if (rc != UNIFYFS_SUCCESS) { - errno = unifyfs_err_map_to_errno(rc); + errno = unifyfs_rc_errno(rc); return 1; } return 0; @@ -1548,13 +1601,17 @@ void UNIFYFS_WRAP(rewind)(FILE* stream) /* lookup stream */ unifyfs_stream_t* s = (unifyfs_stream_t*) stream; - /* TODO: check that stream is active */ + /* check that stream is active */ + if (s->fd < 0) { + errno = EBADF; + return; + } /* seek to front of file */ int rc = unifyfs_fseek(stream, (off_t) 0L, SEEK_SET); /* set errno */ - errno = unifyfs_err_map_to_errno(rc); + errno = unifyfs_rc_errno(rc); /* clear error indicator if seek successful */ if (rc == 0) { @@ -1689,7 +1746,6 @@ int UNIFYFS_WRAP(fflush)(FILE* stream) /* TODO: check that stream is active */ /* flush output on stream */ int rc = unifyfs_stream_flush(stream); - if (rc != UNIFYFS_SUCCESS) { /* ERROR: flush sets error indicator and errno */ return EOF; @@ -1825,7 +1881,7 @@ int UNIFYFS_WRAP(fclose)(FILE* stream) /* close the file */ int close_rc = unifyfs_fid_close(fid); if (close_rc != UNIFYFS_SUCCESS) { - errno = unifyfs_err_map_to_errno(close_rc); + errno = unifyfs_rc_errno(close_rc); return EOF; } @@ -2229,7 +2285,7 @@ static int __srefill(unifyfs_stream_t* stream) if (setvbuf_rc != UNIFYFS_SUCCESS) { /* ERROR: failed to associate buffer */ s->err = 1; - errno = unifyfs_err_map_to_errno(setvbuf_rc); + errno = unifyfs_rc_errno(setvbuf_rc); return 1; } } @@ -2258,18 +2314,22 @@ static int 
__srefill(unifyfs_stream_t* stream) } /* read data from file into buffer */ - size_t bufcount; - int read_rc = unifyfs_fd_read(s->fd, current, s->buf, s->bufsize, &bufcount); + size_t nread = 0; + int read_rc = unifyfs_fd_read(s->fd, current, s->buf, s->bufsize, + &nread); if (read_rc != UNIFYFS_SUCCESS) { - /* ERROR: read error, set error indicator and errno */ + /* ERROR: set error indicator and errno */ s->err = 1; - errno = unifyfs_err_map_to_errno(read_rc); + errno = unifyfs_rc_errno(read_rc); return 1; } + /* update file descriptor position to account for bytes we just read */ + filedesc->pos = current + nread; + /* record new buffer range within file */ s->bufpos = current; - s->buflen = bufcount; + s->buflen = nread; } /* determine number of bytes to copy from stream buffer */ @@ -2573,6 +2633,7 @@ __svfscanf(unifyfs_stream_t* fp, const char* fmt0, va_list ap) char ccltab[256]; /* character class table for %[...] */ char buf[BUF]; /* buffer for numeric conversions */ + base = 0; nassigned = 0; nconversions = 0; nread = 0; diff --git a/client/src/unifyfs-stdio.h b/client/src/unifyfs-stdio.h index 447ea0634..e8bd68608 100644 --- a/client/src/unifyfs-stdio.h +++ b/client/src/unifyfs-stdio.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. diff --git a/client/src/unifyfs-sysio.c b/client/src/unifyfs-sysio.c index 0dd46c36a..b7d1ab249 100644 --- a/client/src/unifyfs-sysio.c +++ b/client/src/unifyfs-sysio.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. 
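/*
 * For illustration only: unifyfs_stream_flush() above writes the stream's
 * dirty buffer with unifyfs_fd_write() and then calls unifyfs_fid_sync()
 * so the server learns about the new extents. The sketch below is a
 * self-contained model of that write-back-then-sync pattern; toy_stream,
 * toy_write(), and toy_sync() are invented stand-ins, not UnifyFS code.
 */
#include <stdio.h>
#include <string.h>

#define TOY_BUFSZ 16

struct toy_stream {
    char   buf[TOY_BUFSZ];  /* write-back buffer */
    size_t buflen;          /* bytes currently buffered */
    size_t pos;             /* file offset the buffer starts at */
    int    dirty;           /* nonzero if buffer holds unwritten data */
};

/* stand-ins for the data write and the metadata sync */
static void toy_write(size_t pos, const char* data, size_t len)
{
    printf("write %zu bytes at offset %zu\n", len, pos);
}

static void toy_sync(void)
{
    printf("sync: publish new extents to the server\n");
}

/* flush the buffer if it holds data: write it, then sync metadata */
static void toy_flush(struct toy_stream* s)
{
    if (s->dirty) {
        toy_write(s->pos, s->buf, s->buflen);
        toy_sync();
        s->pos += s->buflen;
        s->buflen = 0;
        s->dirty = 0;
    }
}

/* buffer data, flushing whenever the buffer fills up */
static void toy_buffered_write(struct toy_stream* s,
                               const char* data, size_t len)
{
    while (len > 0) {
        size_t room = TOY_BUFSZ - s->buflen;
        size_t n = (len < room) ? len : room;
        memcpy(s->buf + s->buflen, data, n);
        s->buflen += n;
        s->dirty = 1;
        data += n;
        len -= n;
        if (s->buflen == TOY_BUFSZ) {
            toy_flush(s);
        }
    }
}

int main(void)
{
    struct toy_stream s = {0};
    toy_buffered_write(&s, "hello unifyfs stream buffering", 30);
    toy_flush(&s);  /* like fflush(): push out whatever is left */
    return 0;
}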
@@ -43,14 +43,6 @@ #include "unifyfs-internal.h" #include "unifyfs-sysio.h" #include "margo_client.h" -#include "ucr_read_builder.h" - -/* ------------------- - * define external variables - * --------------------*/ - -extern int unifyfs_spilloverblock; -extern int unifyfs_use_spillover; /* --------------------------------------- * POSIX wrappers: paths @@ -59,17 +51,18 @@ extern int unifyfs_use_spillover; int UNIFYFS_WRAP(access)(const char* path, int mode) { /* determine whether we should intercept this path */ - if (unifyfs_intercept_path(path)) { + char upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(path, upath)) { /* check if path exists */ - if (unifyfs_get_fid_from_path(path) < 0) { + if (unifyfs_get_fid_from_path(upath) < 0) { LOGDBG("access: unifyfs_get_id_from path failed, returning -1, %s", - path); + upath); errno = ENOENT; return -1; } /* currently a no-op */ - LOGDBG("access: path intercepted, returning 0, %s", path); + LOGDBG("access: path intercepted, returning 0, %s", upath); return 0; } else { LOGDBG("access: calling MAP_OR_FAIL, %s", path); @@ -88,19 +81,20 @@ int UNIFYFS_WRAP(mkdir)(const char* path, mode_t mode) * It doesn't check to see if parent directory exists */ /* determine whether we should intercept this path */ - if (unifyfs_intercept_path(path)) { + char upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(path, upath)) { /* check if it already exists */ - if (unifyfs_get_fid_from_path(path) >= 0) { + if (unifyfs_get_fid_from_path(upath) >= 0) { errno = EEXIST; return -1; } /* add directory to file list */ - int ret = unifyfs_fid_create_directory(path); + int ret = unifyfs_fid_create_directory(upath); if (ret != UNIFYFS_SUCCESS) { /* failed to create the directory, * set errno and return */ - errno = unifyfs_err_map_to_errno(ret); + errno = unifyfs_rc_errno(ret); return -1; } @@ -116,15 +110,16 @@ int UNIFYFS_WRAP(mkdir)(const char* path, mode_t mode) int UNIFYFS_WRAP(rmdir)(const char* path) { /* determine whether we should intercept this path */ - if (unifyfs_intercept_path(path)) { + char upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(path, upath)) { /* check if the mount point itself is being deleted */ - if (!strcmp(path, unifyfs_mount_prefix)) { + if (!strcmp(upath, unifyfs_mount_prefix)) { errno = EBUSY; return -1; } /* check if path exists */ - int fid = unifyfs_get_fid_from_path(path); + int fid = unifyfs_get_fid_from_path(upath); if (fid < 0) { errno = ENOENT; return -1; @@ -137,7 +132,7 @@ int UNIFYFS_WRAP(rmdir)(const char* path) } /* is it empty? */ - if (!unifyfs_fid_is_dir_empty(path)) { + if (!unifyfs_fid_is_dir_empty(upath)) { errno = ENOTEMPTY; return -1; } @@ -147,7 +142,7 @@ int UNIFYFS_WRAP(rmdir)(const char* path) if (ret != UNIFYFS_SUCCESS) { /* failed to remove the directory, * set errno and return */ - errno = unifyfs_err_map_to_errno(ret); + errno = unifyfs_rc_errno(ret); return -1; } @@ -160,32 +155,340 @@ int UNIFYFS_WRAP(rmdir)(const char* path) } } +int UNIFYFS_WRAP(chdir)(const char* path) +{ + /* determine whether we should intercept this path */ + char upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(path, upath)) { + /* TODO: check that path is not a file? 
*/ + /* we're happy to change into any directory in unifyfs */ + if (unifyfs_cwd != NULL) { + free(unifyfs_cwd); + } + unifyfs_cwd = strdup(upath); + return 0; + } else { + MAP_OR_FAIL(chdir); + int ret = UNIFYFS_REAL(chdir)(path); + + /* if the change dir was successful, + * update our current working directory */ + if (unifyfs_initialized && ret == 0) { + if (unifyfs_cwd != NULL) { + free(unifyfs_cwd); + } + + /* if we did a real chdir, let's use a real getcwd + * to get the current working directory */ + MAP_OR_FAIL(getcwd); + char* cwd = UNIFYFS_REAL(getcwd)(NULL, 0); + if (cwd != NULL) { + unifyfs_cwd = cwd; + + /* parts of the code may assume unifyfs_cwd is a max size */ + size_t len = strlen(cwd) + 1; + if (len > UNIFYFS_MAX_FILENAME) { + LOGERR("Current working dir longer (%lu bytes) " + "than UNIFYFS_MAX_FILENAME=%d", + (unsigned long) len, UNIFYFS_MAX_FILENAME); + } + } else { + /* ERROR */ + LOGERR("Failed to getcwd after chdir(%s) errno=%d %s", + path, errno, strerror(errno)); + } + } + + return ret; + } +} + +/* common logic for getcwd and __getcwd_chk */ +static char* _getcwd_impl(char* path, size_t size) +{ + /* man page if size=0 and path not NULL, return EINVAL */ + if (size == 0 && path != NULL) { + errno = EINVAL; + return NULL; + } + + /* get length of current working dir */ + size_t len = strlen(unifyfs_cwd) + 1; + + /* if user didn't provide a buffer, + * we attempt to allocate and return one for them */ + if (path == NULL) { + /* we'll allocate a buffer to return to the caller */ + char* buf = NULL; + + /* if path is NULL and size is positive, we must + * allocate a buffer of length size and copy into it */ + if (size > 0) { + /* check that size is big enough for the string */ + if (len <= size) { + /* path will fit, allocate buffer and copy */ + buf = (char*) malloc(size); + if (buf != NULL) { + strncpy(buf, unifyfs_cwd, size); + } else { + errno = ENOMEM; + } + return buf; + } else { + /* user's buffer limit is too small */ + errno = ERANGE; + return NULL; + } + } + + /* otherwise size == 0, so allocate a buffer + * that is big enough */ + buf = (char*) malloc(len); + if (buf != NULL) { + strncpy(buf, unifyfs_cwd, len); + } else { + errno = ENOMEM; + } + return buf; + } + + /* to get here, caller provided an actual buffer, + * check that path fits in the caller's buffer */ + if (len <= size) { + /* current working dir fits, copy and return */ + strncpy(path, unifyfs_cwd, size); + return path; + } else { + /* user's buffer is too small */ + errno = ERANGE; + return NULL; + } +} + +char* UNIFYFS_WRAP(__getcwd_chk)(char* path, size_t size, size_t buflen) +{ + /* if we're initialized, we're tracking the current working dir */ + if (unifyfs_initialized) { + /* check that we have a string, + * return unusual error in case we don't */ + if (unifyfs_cwd == NULL) { + errno = EACCES; + return NULL; + } + + /* If unifyfs_cwd is in unifyfs space, handle the cwd logic. + * Otherwise, call the real getcwd, and if actual cwd does + * not match what we expect, throw an error (the user somehow + * changed dir without us noticing, so there is a bug here) */ + char upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(unifyfs_cwd, upath)) { +#if 0 + /* TODO: what to do here? 
*/
+            if (size > buflen) {
+                __chk_fail();
+            }
+#endif
+
+            /* delegate the rest to our common getcwd function */
+            return _getcwd_impl(path, size);
+        } else {
+            /* current working dir is in real file system,
+             * fall through to real getcwd call */
+            MAP_OR_FAIL(__getcwd_chk);
+            char* ret = UNIFYFS_REAL(__getcwd_chk)(path, size, buflen);
+
+            /* check that current working dir is what we think
+             * it should be as a sanity check */
+            if (ret != NULL && strcmp(unifyfs_cwd, ret) != 0) {
+                LOGERR("Expected cwd=%s vs actual=%s",
+                       unifyfs_cwd, ret);
+            }
+
+            return ret;
+        }
+    } else {
+        /* not initialized, so fall through to real __getcwd_chk */
+        MAP_OR_FAIL(__getcwd_chk);
+        char* ret = UNIFYFS_REAL(__getcwd_chk)(path, size, buflen);
+        return ret;
+    }
+}
+
+char* UNIFYFS_WRAP(getcwd)(char* path, size_t size)
+{
+    /* if we're initialized, we're tracking the current working dir */
+    if (unifyfs_initialized) {
+
+        /* check that we have a string,
+         * return unusual error in case we don't */
+        if (unifyfs_cwd == NULL) {
+            errno = EACCES;
+            return NULL;
+        }
+
+        /* If unifyfs_cwd is in unifyfs space, handle the cwd logic.
+         * Otherwise, call the real getcwd, and if actual cwd does
+         * not match what we expect, throw an error (the user somehow
+         * changed dir without us noticing, so there is a bug here) */
+        char upath[UNIFYFS_MAX_FILENAME];
+        if (unifyfs_intercept_path(unifyfs_cwd, upath)) {
+            /* delegate the rest to our common getcwd function */
+            return _getcwd_impl(path, size);
+        } else {
+            /* current working dir is in real file system,
+             * fall through to real getcwd call */
+            MAP_OR_FAIL(getcwd);
+            char* ret = UNIFYFS_REAL(getcwd)(path, size);
+
+            /* check that current working dir is what we think
+             * it should be as a sanity check */
+            if (ret != NULL && strcmp(unifyfs_cwd, ret) != 0) {
+                LOGERR("Expected cwd=%s vs actual=%s",
+                       unifyfs_cwd, ret);
+            }
+
+            return ret;
+        }
+    } else {
+        /* not initialized, so fall through to real getcwd */
+        MAP_OR_FAIL(getcwd);
+        char* ret = UNIFYFS_REAL(getcwd)(path, size);
+        return ret;
+    }
+}
+
+char* UNIFYFS_WRAP(getwd)(char* path)
+{
+    /* if we're initialized, we're tracking the current working dir */
+    if (unifyfs_initialized) {
+        /* check that we have a string,
+         * return unusual error in case we don't */
+        if (unifyfs_cwd == NULL) {
+            errno = EACCES;
+            return NULL;
+        }
+
+        /* If unifyfs_cwd is in unifyfs space, handle the cwd logic.
+         * Otherwise, call the real getwd, and if actual cwd does
+         * not match what we expect, throw an error (the user somehow
+         * changed dir without us noticing, so there is a bug here) */
+        char upath[UNIFYFS_MAX_FILENAME];
+        if (unifyfs_intercept_path(unifyfs_cwd, upath)) {
+            /* check that we got a valid path */
+            if (path == NULL) {
+                errno = EINVAL;
+                return NULL;
+            }
+
+            /* finally get length of current working dir and check
+             * that it fits in the caller's buffer */
+            size_t len = strlen(unifyfs_cwd) + 1;
+            if (len <= PATH_MAX) {
+                strncpy(path, unifyfs_cwd, PATH_MAX);
+                return path;
+            } else {
+                /* user's buffer is too small */
+                errno = ENAMETOOLONG;
+                return NULL;
+            }
+        } else {
+            /* current working dir is in real file system,
+             * fall through to real getwd call */
+            MAP_OR_FAIL(getwd);
+            char* ret = UNIFYFS_REAL(getwd)(path);
+
+            /* check that current working dir is what we think
+             * it should be as a sanity check */
+            if (ret != NULL && strcmp(unifyfs_cwd, ret) != 0) {
+                LOGERR("Expected cwd=%s vs actual=%s",
+                       unifyfs_cwd, ret);
+            }
+
+            return ret;
+        }
+    } else {
+        /* not initialized, so fall through to real getwd */
+        MAP_OR_FAIL(getwd);
+        char* ret = UNIFYFS_REAL(getwd)(path);
+        return ret;
+    }
+}
+
+char* UNIFYFS_WRAP(get_current_dir_name)(void)
+{
+    /* if we're initialized, we're tracking the current working dir */
+    if (unifyfs_initialized) {
+        /* check that we have a string, return unusual error
+         * in case we don't */
+        if (unifyfs_cwd == NULL) {
+            errno = EACCES;
+            return NULL;
+        }
+
+        /* If unifyfs_cwd is in unifyfs space, handle the cwd logic.
+         * Otherwise, call real get_current_dir_name, and if actual cwd does
+         * not match what we expect, throw an error (the user somehow
+         * changed dir without us noticing, so there is a bug here) */
+        char upath[UNIFYFS_MAX_FILENAME];
+        if (unifyfs_intercept_path(unifyfs_cwd, upath)) {
+            /* supposed to allocate a copy of the current working dir
+             * and return that to caller, to be freed by caller */
+            char* ret = strdup(unifyfs_cwd);
+            if (ret == NULL) {
+                errno = ENOMEM;
+            }
+            return ret;
+        } else {
+            /* current working dir is in real file system,
+             * fall through to real get_current_dir_name call */
+            MAP_OR_FAIL(get_current_dir_name);
+            char* ret = UNIFYFS_REAL(get_current_dir_name)();
+
+            /* check that current working dir is what we think
+             * it should be as a sanity check */
+            if (ret != NULL && strcmp(unifyfs_cwd, ret) != 0) {
+                LOGERR("Expected cwd=%s vs actual=%s",
+                       unifyfs_cwd, ret);
+            }

+            return ret;
+        }
+    } else {
+        /* not initialized, so fall through to real get_current_dir_name */
+        MAP_OR_FAIL(get_current_dir_name);
+        char* ret = UNIFYFS_REAL(get_current_dir_name)();
+        return ret;
+    }
+}
+
 int UNIFYFS_WRAP(rename)(const char* oldpath, const char* newpath)
 {
     /* TODO: allow oldpath / newpath to split across memfs and normal
      * linux fs, which means we'll need to do a read / write */

     /* check whether the old path is in our file system */
-    if (unifyfs_intercept_path(oldpath)) {
+    char old_upath[UNIFYFS_MAX_FILENAME];
+    if (unifyfs_intercept_path(oldpath, old_upath)) {
         /* for now, we can only rename within our file system */
-        if (!unifyfs_intercept_path(newpath)) {
+        char new_upath[UNIFYFS_MAX_FILENAME];
+        if (!unifyfs_intercept_path(newpath, new_upath)) {
             /* ERROR: can't yet rename across file systems */
             errno = EXDEV;
             return -1;
         }

         /* verify that we really have a file by the old name */
-        int fid = unifyfs_get_fid_from_path(oldpath);
+        int fid = unifyfs_get_fid_from_path(old_upath);
         if (fid < 0) {
             /* ERROR: oldname does
not exist */ - LOGDBG("Couldn't find entry for %s in UNIFYFS", oldpath); + LOGDBG("Couldn't find entry for %s in UNIFYFS", old_upath); errno = ENOENT; return -1; } LOGDBG("orig file in position %d", fid); /* check that new name is within bounds */ - size_t newpathlen = strlen(newpath) + 1; + size_t newpathlen = strlen(new_upath) + 1; if (newpathlen > UNIFYFS_MAX_FILENAME) { errno = ENAMETOOLONG; return -1; @@ -194,7 +497,7 @@ int UNIFYFS_WRAP(rename)(const char* oldpath, const char* newpath) /* TODO: rename should replace existing file atomically */ /* verify that we don't already have a file by the new name */ - int newfid = unifyfs_get_fid_from_path(newpath); + int newfid = unifyfs_get_fid_from_path(new_upath); if (newfid >= 0) { /* something exists in newpath, need to delete it */ int ret = UNIFYFS_WRAP(unlink)(newpath); @@ -207,14 +510,15 @@ int UNIFYFS_WRAP(rename)(const char* oldpath, const char* newpath) /* finally overwrite the old name with the new name */ LOGDBG("Changing %s to %s", - (char*)&unifyfs_filelist[fid].filename, newpath); - strcpy((void*)&unifyfs_filelist[fid].filename, newpath); + (char*)&unifyfs_filelist[fid].filename, new_upath); + strcpy((void*)&unifyfs_filelist[fid].filename, new_upath); /* success */ return 0; } else { /* for now, we can only rename within our file system */ - if (unifyfs_intercept_path(newpath)) { + char upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(newpath, upath)) { /* ERROR: can't yet rename across file systems */ errno = EXDEV; return -1; @@ -230,22 +534,26 @@ int UNIFYFS_WRAP(rename)(const char* oldpath, const char* newpath) int UNIFYFS_WRAP(truncate)(const char* path, off_t length) { /* determine whether we should intercept this path or not */ - if (unifyfs_intercept_path(path)) { - /* lookup the fid for the path */ - int fid = unifyfs_get_fid_from_path(path); - if (fid < 0) { - /* ERROR: file does not exist */ - LOGDBG("Couldn't find entry for %s in UNIFYFS", path); - errno = ENOENT; - return -1; - } - - /* truncate the file */ - int rc = unifyfs_fid_truncate(fid, length); - if (rc != UNIFYFS_SUCCESS) { - LOGDBG("unifyfs_fid_truncate failed for %s in UNIFYFS", path); - errno = EIO; - return -1; + char upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(path, upath)) { + /* get file id for path name */ + int fid = unifyfs_get_fid_from_path(upath); + if (fid >= 0) { + /* got the file locally, use fid_truncate the file */ + int rc = unifyfs_fid_truncate(fid, length); + if (rc != UNIFYFS_SUCCESS) { + errno = unifyfs_rc_errno(rc); + return -1; + } + } else { + /* invoke truncate rpc */ + int gfid = unifyfs_generate_gfid(upath); + int rc = invoke_client_truncate_rpc(gfid, length); + if (rc != UNIFYFS_SUCCESS) { + LOGDBG("truncate rpc failed %s in UNIFYFS", upath); + errno = unifyfs_rc_errno(rc); + return -1; + } } /* success */ @@ -260,12 +568,13 @@ int UNIFYFS_WRAP(truncate)(const char* path, off_t length) int UNIFYFS_WRAP(unlink)(const char* path) { /* determine whether we should intercept this path or not */ - if (unifyfs_intercept_path(path)) { + char upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(path, upath)) { /* get file id for path name */ - int fid = unifyfs_get_fid_from_path(path); + int fid = unifyfs_get_fid_from_path(upath); if (fid < 0) { /* ERROR: file does not exist */ - LOGDBG("Couldn't find entry for %s in UNIFYFS", path); + LOGDBG("Couldn't find entry for %s in UNIFYFS", upath); errno = ENOENT; return -1; } @@ -273,7 +582,7 @@ int UNIFYFS_WRAP(unlink)(const char* path) /* check that it's not a 
directory */ if (unifyfs_fid_is_dir(fid)) { /* ERROR: is a directory */ - LOGDBG("Attempting to unlink a directory %s in UNIFYFS", path); + LOGDBG("Attempting to unlink a directory %s in UNIFYFS", upath); errno = EISDIR; return -1; } @@ -281,7 +590,7 @@ int UNIFYFS_WRAP(unlink)(const char* path) /* delete the file */ int ret = unifyfs_fid_unlink(fid); if (ret != UNIFYFS_SUCCESS) { - errno = unifyfs_err_map_to_errno(ret); + errno = unifyfs_rc_errno(ret); return -1; } @@ -297,12 +606,13 @@ int UNIFYFS_WRAP(unlink)(const char* path) int UNIFYFS_WRAP(remove)(const char* path) { /* determine whether we should intercept this path or not */ - if (unifyfs_intercept_path(path)) { + char upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(path, upath)) { /* get file id for path name */ - int fid = unifyfs_get_fid_from_path(path); + int fid = unifyfs_get_fid_from_path(upath); if (fid < 0) { /* ERROR: file does not exist */ - LOGDBG("Couldn't find entry for %s in UNIFYFS", path); + LOGDBG("Couldn't find entry for %s in UNIFYFS", upath); errno = ENOENT; return -1; } @@ -311,7 +621,7 @@ int UNIFYFS_WRAP(remove)(const char* path) if (unifyfs_fid_is_dir(fid)) { /* TODO: shall be equivalent to rmdir(path) */ /* ERROR: is a directory */ - LOGDBG("Attempting to remove a directory %s in UNIFYFS", path); + LOGDBG("Attempting to remove a directory %s in UNIFYFS", upath); errno = EISDIR; return -1; } @@ -320,7 +630,7 @@ int UNIFYFS_WRAP(remove)(const char* path) /* delete the file */ int ret = unifyfs_fid_unlink(fid); if (ret != UNIFYFS_SUCCESS) { - errno = unifyfs_err_map_to_errno(ret); + errno = unifyfs_rc_errno(ret); return -1; } @@ -333,29 +643,90 @@ int UNIFYFS_WRAP(remove)(const char* path) } } -int UNIFYFS_WRAP(stat)(const char* path, struct stat* buf) +/* Get global file meta data with accurate file size */ +static int unifyfs_get_meta_with_size(int gfid, unifyfs_file_attr_t* pfattr) { - LOGDBG("stat was called for %s", path); + /* lookup global meta data for this file */ + int ret = unifyfs_get_global_file_meta(gfid, pfattr); + if (ret != UNIFYFS_SUCCESS) { + LOGDBG("get metadata rpc failed"); + return ret; + } - if (unifyfs_intercept_path(path)) { - /* check that caller gave us a buffer to write to */ - if (!buf) { - errno = EFAULT; - return -1; + /* if file is laminated, we assume the file size in the meta + * data is already accurate, if not, look up the current file + * size with an rpc */ + if (!pfattr->is_laminated) { + /* lookup current global file size */ + size_t filesize; + ret = invoke_client_filesize_rpc(gfid, &filesize); + if (ret == UNIFYFS_SUCCESS) { + /* success, we have a file size value */ + pfattr->size = (uint64_t) filesize; + } else { + /* failed to get file size for some reason */ + LOGDBG("filesize rpc failed"); + return ret; } + } - /* get global file id for path */ - int gfid = unifyfs_generate_gfid(path); + return UNIFYFS_SUCCESS; +} - /* lookup stat info for global file id */ - int ret = unifyfs_gfid_stat(gfid, buf); - if (ret != UNIFYFS_SUCCESS) { - errno = unifyfs_err_map_to_errno(ret); +/* The main stat call for all the *stat() functions */ +static int __stat(const char* path, struct stat* buf) +{ + /* check that caller gave us a buffer to write to */ + if (!buf) { + /* forgot buffer for stat */ + LOGDBG("invalid stat buffer"); + errno = EINVAL; + return -1; + } + + /* flush any pending writes if needed */ + int fid = unifyfs_get_fid_from_path(path); + if (fid != -1) { + int sync_rc = unifyfs_fid_sync(fid); + if (sync_rc != UNIFYFS_SUCCESS) { + errno = 
unifyfs_rc_errno(sync_rc); return -1; } + } - /* success */ - return 0; + /* clear the user buffer */ + memset(buf, 0, sizeof(*buf)); + + /* get global file id for given path */ + int gfid = unifyfs_generate_gfid(path); + + /* get stat information for file */ + unifyfs_file_attr_t fattr; + memset(&fattr, 0, sizeof(fattr)); + int ret = unifyfs_get_meta_with_size(gfid, &fattr); + if (ret != UNIFYFS_SUCCESS) { + errno = unifyfs_rc_errno(ret); + return -1; + } + + /* update local file metadata (if applicable) */ + if (fid != -1) { + unifyfs_fid_update_file_meta(fid, &fattr); + } + + /* copy attributes to stat struct */ + unifyfs_file_attr_to_stat(&fattr, buf); + + return 0; +} + +int UNIFYFS_WRAP(stat)(const char* path, struct stat* buf) +{ + LOGDBG("stat was called for %s", path); + char upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(path, upath)) { + int ret = __stat(upath, buf); + return ret; } else { MAP_OR_FAIL(stat); int ret = UNIFYFS_REAL(stat)(path, buf); @@ -363,41 +734,22 @@ int UNIFYFS_WRAP(stat)(const char* path, struct stat* buf) } } -#if 0 int UNIFYFS_WRAP(fstat)(int fd, struct stat* buf) { LOGDBG("fstat was called for fd: %d", fd); /* check whether we should intercept this file descriptor */ if (unifyfs_intercept_fd(&fd)) { - if (!buf) { - errno = EFAULT; - return -1; - } - - /* get the file id for this file descriptor */ int fid = unifyfs_get_fid_from_fd(fd); - if (fid < 0) { - errno = EBADF; - return -1; - } - - /* lookup stat info for this file id */ - int ret = unifyfs_fid_stat(fid, buf); - if (ret < 0) { - errno = unifyfs_err_map_to_errno(ret); - return -1; - } - - /* success */ - return 0; + const char* path = unifyfs_path_from_fid(fid); + int ret = __stat(path, buf); + return ret; } else { MAP_OR_FAIL(fstat); int ret = UNIFYFS_REAL(fstat)(fd, buf); return ret; } } -#endif /* * NOTE on __xstat(2), __lxstat(2), and __fxstat(2) @@ -408,74 +760,49 @@ int UNIFYFS_WRAP(fstat)(int fd, struct stat* buf) * instead of using the absolute value 3. 
*/ +#ifdef HAVE___XSTAT int UNIFYFS_WRAP(__xstat)(int vers, const char* path, struct stat* buf) { LOGDBG("xstat was called for %s", path); - if (unifyfs_intercept_path(path)) { + char upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(path, upath)) { if (vers != _STAT_VER) { errno = EINVAL; return -1; } - - if (!buf) { - errno = EFAULT; - return -1; - } - - /* get global file id for path */ - int gfid = unifyfs_generate_gfid(path); - - /* lookup stat info for global file id */ - int ret = unifyfs_gfid_stat(gfid, buf); - if (ret != UNIFYFS_SUCCESS) { - errno = unifyfs_err_map_to_errno(ret); - return -1; - } - - /* success */ - return 0; + int ret = __stat(upath, buf); + return ret; } else { MAP_OR_FAIL(__xstat); int ret = UNIFYFS_REAL(__xstat)(vers, path, buf); return ret; } } +#endif +#ifdef HAVE___LXSTAT int UNIFYFS_WRAP(__lxstat)(int vers, const char* path, struct stat* buf) { LOGDBG("lxstat was called for %s", path); - if (unifyfs_intercept_path(path)) { + char upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(path, upath)) { if (vers != _STAT_VER) { errno = EINVAL; return -1; } - - if (!buf) { - errno = EFAULT; - return -1; - } - - /* get global file id for path */ - int gfid = unifyfs_generate_gfid(path); - - /* lookup stat info for global file id */ - int ret = unifyfs_gfid_stat(gfid, buf); - if (ret != UNIFYFS_SUCCESS) { - errno = unifyfs_err_map_to_errno(ret); - return -1; - } - - /* success */ - return 0; + int ret = __stat(upath, buf); + return ret; } else { MAP_OR_FAIL(__lxstat); int ret = UNIFYFS_REAL(__lxstat)(vers, path, buf); return ret; } } +#endif +#ifdef HAVE___FXSTAT int UNIFYFS_WRAP(__fxstat)(int vers, int fd, struct stat* buf) { LOGDBG("fxstat was called for fd %d", fd); @@ -487,209 +814,247 @@ int UNIFYFS_WRAP(__fxstat)(int vers, int fd, struct stat* buf) return -1; } - if (!buf) { - errno = EINVAL; - return -1; - } - - /* get the file id for this file descriptor */ int fid = unifyfs_get_fid_from_fd(fd); - if (fid < 0) { - errno = EBADF; - return -1; - } - - /* lookup stat info for this file id */ - int ret = unifyfs_fid_stat(fid, buf); - if (ret != UNIFYFS_SUCCESS) { - errno = unifyfs_err_map_to_errno(ret); - return -1; - } - - /* success */ - return 0; + const char* path = unifyfs_path_from_fid(fid); + int ret = __stat(path, buf); + return ret; } else { MAP_OR_FAIL(__fxstat); int ret = UNIFYFS_REAL(__fxstat)(vers, fd, buf); return ret; } } +#endif + + +#ifdef HAVE_SYS_STATFS_H + +/* tmpfs seems like a safe choice for something like UnifyFS */ +#ifndef TMPFS_MAGIC +#define TMPFS_MAGIC 0x01021994 +#endif + +static int unifyfs_statfs(struct statfs* fsbuf) +{ + if (NULL != fsbuf) { + memset(fsbuf, 0, sizeof(*fsbuf)); + + fsbuf->f_type = TMPFS_MAGIC; /* File system type */ + fsbuf->f_bsize = UNIFYFS_LOGIO_CHUNK_SIZE; /* Optimal block size */ + //fsbuf->f_blocks = ??; /* Total data blocks in filesystem */ + //fsbuf->f_bfree = ??; /* Free blocks in filesystem */ + //fsbuf->f_bavail = ??; /* Free blocks available */ + fsbuf->f_files = unifyfs_max_files; /* Total file nodes */ + //fsbuf->f_ffree = ??; /* Free file nodes in filesystem */ + fsbuf->f_namelen = UNIFYFS_MAX_FILENAME; /* Max filename length */ + return 0; + } else { + return EFAULT; + } +} + +#ifdef HAVE_STATFS +int UNIFYFS_WRAP(statfs)(const char* path, struct statfs* fsbuf) +{ + LOGDBG("statfs() was called for %s", path); + + int ret; + char upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(path, upath)) { + ret = unifyfs_statfs(fsbuf); + if (ret) { + errno = ret; + ret = -1; + } + } else 
{ + MAP_OR_FAIL(statfs); + ret = UNIFYFS_REAL(statfs)(path, fsbuf); + } + return ret; +} +#endif + +#ifdef HAVE_FSTATFS +int UNIFYFS_WRAP(fstatfs)(int fd, struct statfs* fsbuf) +{ + LOGDBG("fstatfs() was called for fd: %d", fd); + + /* check whether we should intercept this file descriptor */ + int ret; + if (unifyfs_intercept_fd(&fd)) { + ret = unifyfs_statfs(fsbuf); + if (ret) { + errno = ret; + ret = -1; + } + } else { + MAP_OR_FAIL(fstatfs); + ret = UNIFYFS_REAL(fstatfs)(fd, fsbuf); + } + return ret; +} +#endif + +#endif /* HAVE_SYS_STATFS_H */ /* --------------------------------------- * POSIX wrappers: file descriptors * --------------------------------------- */ -/* read count bytes info buf from file starting at offset pos, - * returns number of bytes actually read in retcount, - * retcount will be less than count only if an error occurs - * or end of file is reached */ -int unifyfs_fd_read(int fd, off_t pos, void* buf, size_t count, - size_t* retcount) +/* + * Read 'count' bytes info 'buf' from file starting at offset 'pos'. + * + * Returns success or error code. + */ +int unifyfs_fd_read(int fd, off_t pos, void* buf, size_t count, size_t* nread) { + /* assume we'll fail, set bytes read to 0 as a clue */ + *nread = 0; + /* get the file id for this file descriptor */ int fid = unifyfs_get_fid_from_fd(fd); if (fid < 0) { - return UNIFYFS_ERROR_BADF; + return EBADF; } /* it's an error to read from a directory */ if (unifyfs_fid_is_dir(fid)) { /* TODO: note that read/pread can return this, but not fread */ - return UNIFYFS_ERROR_ISDIR; + return EISDIR; } /* check that file descriptor is open for read */ unifyfs_fd_t* filedesc = unifyfs_get_filedesc_from_fd(fd); if (!filedesc->read) { - return UNIFYFS_ERROR_BADF; + return EBADF; } /* TODO: is it safe to assume that off_t is bigger than size_t? */ /* check that we don't overflow the file length */ if (unifyfs_would_overflow_offt(pos, (off_t) count)) { - return UNIFYFS_ERROR_OVERFLOW; - } - - /* TODO: check that file is open for reading */ - - /* check that we don't try to read past the end of the file */ - off_t lastread = pos + (off_t) count; - off_t filesize = unifyfs_fid_size(fid); - if (filesize < lastread) { - /* adjust count so we don't read past end of file */ - if (filesize > pos) { - /* read all bytes until end of file */ - count = (size_t)(filesize - pos); - } else { - /* pos is already at or past the end of the file */ - count = 0; - } + return EOVERFLOW; } - /* record number of bytes that we'll actually read */ - *retcount = count; - /* if we don't read any bytes, return success */ if (count == 0) { + LOGDBG("returning EOF"); return UNIFYFS_SUCCESS; } - /* read data from file */ - int read_rc = unifyfs_fid_read(fid, pos, buf, count); - return read_rc; + /* TODO: handle error if sync fails? 
*/ + /* sync data for file before reading, if needed */ + unifyfs_fid_sync(fid); + + /* fill in read request */ + read_req_t req; + req.gfid = unifyfs_gfid_from_fid(fid); + req.offset = (size_t) pos; + req.length = count; + req.nread = 0; + req.errcode = EINPROGRESS; + req.buf = buf; + + /* execute read operation */ + int ret = unifyfs_gfid_read_reqs(&req, 1); + if (ret != UNIFYFS_SUCCESS) { + /* failed to issue read operation */ + return ret; + } else if (req.errcode != UNIFYFS_SUCCESS) { + /* read executed, but failed */ + return req.errcode; + } + + /* success, get number of bytes read from read request field */ + *nread = req.nread; + + return UNIFYFS_SUCCESS; } -/* write count bytes from buf into file starting at offset pos, - * allocates new bytes and updates file size as necessary, - * fills any gaps with zeros */ -int unifyfs_fd_write(int fd, off_t pos, const void* buf, size_t count) +/* + * Write 'count' bytes from 'buf' into file starting at offset' pos'. + * Allocates new bytes and updates file size as necessary. It is assumed + * that 'pos' is actually where you want to write, and so O_APPEND behavior + * is ignored. Fills any gaps with zeros + */ +int unifyfs_fd_write(int fd, off_t pos, const void* buf, size_t count, + size_t* nwritten) { + /* assume we'll fail, set bytes written to 0 as a clue */ + *nwritten = 0; + /* get the file id for this file descriptor */ int fid = unifyfs_get_fid_from_fd(fd); if (fid < 0) { - return UNIFYFS_ERROR_BADF; + return EBADF; } /* it's an error to write to a directory */ if (unifyfs_fid_is_dir(fid)) { - return UNIFYFS_ERROR_INVAL; + return EINVAL; } /* check that file descriptor is open for write */ unifyfs_fd_t* filedesc = unifyfs_get_filedesc_from_fd(fd); if (!filedesc->write) { - return UNIFYFS_ERROR_BADF; + return EBADF; } /* TODO: is it safe to assume that off_t is bigger than size_t? 
*/ /* check that our write won't overflow the length */ if (unifyfs_would_overflow_offt(pos, (off_t) count)) { /* TODO: want to return EFBIG here for streams */ - return UNIFYFS_ERROR_OVERFLOW; - } - - /* TODO: check that file is open for writing */ - - /* get current file size before extending the file */ - off_t filesize = unifyfs_fid_size(fid); - - /* compute new position based on storage type */ - off_t newpos; - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - if (meta->storage == FILE_STORAGE_FIXED_CHUNK) { - /* extend file size and allocate chunks if needed */ - newpos = pos + (off_t) count; - int extend_rc = unifyfs_fid_extend(fid, newpos); - if (extend_rc != UNIFYFS_SUCCESS) { - return extend_rc; - } - - /* fill any new bytes between old size and pos with zero values */ - if (filesize < pos) { - off_t gap_size = pos - filesize; - int zero_rc = unifyfs_fid_write_zero(fid, filesize, gap_size); - if (zero_rc != UNIFYFS_SUCCESS) { - return zero_rc; - } - } - } else if (meta->storage == FILE_STORAGE_LOGIO) { - newpos = filesize + (off_t)count; - int extend_rc = unifyfs_fid_extend(fid, newpos); - if (extend_rc != UNIFYFS_SUCCESS) { - return extend_rc; - } - } else { - return UNIFYFS_ERROR_IO; + return EOVERFLOW; } /* finally write specified data to file */ - int write_rc = unifyfs_fid_write(fid, pos, buf, count); - if (write_rc == 0) { - meta->needs_sync = 1; - if (meta->storage == FILE_STORAGE_LOGIO) { - meta->size = newpos; - meta->log_size = pos + count; - } - } + int write_rc = unifyfs_fid_write(fid, pos, buf, count, nwritten); return write_rc; } -int UNIFYFS_WRAP(creat)(const char* path, mode_t mode) +static int unifyfs_create(char* upath, mode_t mode) { /* equivalent to open(path, O_WRONLY|O_CREAT|O_TRUNC, mode) */ - /* check whether we should intercept this path */ - if (unifyfs_intercept_path(path)) { - /* TODO: handle relative paths using current working directory */ + /* create the file */ + int fid; + int flags = O_WRONLY | O_CREAT | O_TRUNC; + off_t pos; + int rc = unifyfs_fid_open(upath, flags, mode, &fid, &pos); + if (rc != UNIFYFS_SUCCESS) { + errno = unifyfs_rc_errno(rc); + return -1; + } - /* create the file */ - int fid; - off_t pos; - int rc = unifyfs_fid_open(path, O_WRONLY | O_CREAT | O_TRUNC, mode, &fid, &pos); - if (rc != UNIFYFS_SUCCESS) { - errno = unifyfs_err_map_to_errno(rc); - return -1; - } + /* allocate a free file descriptor value */ + int fd = unifyfs_stack_pop(unifyfs_fd_stack); + if (fd < 0) { + /* ran out of file descriptors */ + errno = EMFILE; + return -1; + } - /* allocate a free file descriptor value */ - int fd = unifyfs_stack_pop(unifyfs_fd_stack); - if (fd < 0) { - /* ran out of file descriptors */ - errno = EMFILE; - return -1; - } + /* set file id and file pointer, flags include O_WRONLY */ + unifyfs_fd_t* filedesc = unifyfs_get_filedesc_from_fd(fd); + filedesc->fid = fid; + filedesc->pos = pos; + filedesc->read = 0; + filedesc->write = 1; - /* set file id and file pointer, flags include O_WRONLY */ - unifyfs_fd_t* filedesc = unifyfs_get_filedesc_from_fd(fd); - filedesc->fid = fid; - filedesc->pos = pos; - filedesc->read = 0; - filedesc->write = 1; - LOGDBG("UNIFYFS_open generated fd %d for file %s", fd, path); - /* don't conflict with active system fds that range from 0 - (fd_limit) */ - int ret = fd + unifyfs_fd_limit; - return ret; + /* don't conflict with active system fds that range from 0 - (fd_limit) */ + int ret = fd + unifyfs_fd_limit; + LOGDBG("using fds (internal=%d, external=%d) for fid %d file %s", + fd, ret, fid, 
upath); + return ret; +} + +int UNIFYFS_WRAP(creat)(const char* path, mode_t mode) +{ + /* check whether we should intercept this path */ + char upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(path, upath)) { + /* TODO: handle relative paths using current working directory */ + return unifyfs_create(upath, mode); } else { MAP_OR_FAIL(creat); int ret = UNIFYFS_REAL(creat)(path, mode); @@ -700,12 +1065,10 @@ int UNIFYFS_WRAP(creat)(const char* path, mode_t mode) int UNIFYFS_WRAP(creat64)(const char* path, mode_t mode) { /* check whether we should intercept this path */ - if (unifyfs_intercept_path(path)) { - /* ERROR: fn not yet supported */ - fprintf(stderr, "Function not yet supported @ %s:%d\n", - __FILE__, __LINE__); - errno = ENOTSUP; - return -1; + char upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(path, upath)) { + /* TODO: handle relative paths using current working directory */ + return unifyfs_create(upath, mode); } else { MAP_OR_FAIL(creat64); int ret = UNIFYFS_REAL(creat64)(path, mode); @@ -726,15 +1089,16 @@ int UNIFYFS_WRAP(open)(const char* path, int flags, ...) /* determine whether we should intercept this path */ int ret; - if (unifyfs_intercept_path(path)) { + char upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(path, upath)) { /* TODO: handle relative paths using current working directory */ /* create the file */ int fid; off_t pos; - int rc = unifyfs_fid_open(path, flags, mode, &fid, &pos); + int rc = unifyfs_fid_open(upath, flags, mode, &fid, &pos); if (rc != UNIFYFS_SUCCESS) { - errno = unifyfs_err_map_to_errno(rc); + errno = unifyfs_rc_errno(rc); return -1; } @@ -754,10 +1118,12 @@ int UNIFYFS_WRAP(open)(const char* path, int flags, ...) || ((flags & O_RDWR) == O_RDWR); filedesc->write = ((flags & O_WRONLY) == O_WRONLY) || ((flags & O_RDWR) == O_RDWR); - LOGDBG("UNIFYFS_open generated fd %d for file %s", fd, path); + filedesc->append = (flags & O_APPEND); /* don't conflict with active system fds that range from 0 - (fd_limit) */ ret = fd + unifyfs_fd_limit; + LOGDBG("using fds (internal=%d, external=%d) for fid %d file %s", + fd, ret, fid, upath); return ret; } else { MAP_OR_FAIL(open); @@ -770,6 +1136,7 @@ int UNIFYFS_WRAP(open)(const char* path, int flags, ...) } } +#ifdef HAVE_OPEN64 int UNIFYFS_WRAP(open64)(const char* path, int flags, ...) { /* if O_CREAT is set, we should also have some mode flags */ @@ -783,7 +1150,8 @@ int UNIFYFS_WRAP(open64)(const char* path, int flags, ...) /* check whether we should intercept this path */ int ret; - if (unifyfs_intercept_path(path)) { + char upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(path, upath)) { /* Call open wrapper with LARGEFILE flag set*/ if (flags & O_CREAT) { ret = UNIFYFS_WRAP(open)(path, flags | O_LARGEFILE, mode); @@ -801,6 +1169,7 @@ int UNIFYFS_WRAP(open64)(const char* path, int flags, ...) return ret; } +#endif int UNIFYFS_WRAP(__open_2)(const char* path, int flags, ...) { @@ -818,8 +1187,9 @@ int UNIFYFS_WRAP(__open_2)(const char* path, int flags, ...) 
} /* check whether we should intercept this path */ - if (unifyfs_intercept_path(path)) { - LOGDBG("__open_2 was intercepted for path %s", path); + char upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(path, upath)) { + LOGDBG("__open_2 was intercepted for path %s", upath); /* Call open wrapper */ if (flags & O_CREAT) { @@ -866,29 +1236,65 @@ off_t UNIFYFS_WRAP(lseek)(int fd, off_t offset, int whence) /* get current file position */ off_t current_pos = filedesc->pos; - - /* TODO: support SEEK_DATA and SEEK_HOLE? */ + off_t logical_eof; /* compute final file position */ - LOGDBG("seeking from %ld", current_pos); switch (whence) { case SEEK_SET: /* seek to offset */ + if (offset < 0) { + /* negative offset is invalid */ + errno = EINVAL; + return (off_t)(-1); + } current_pos = offset; break; case SEEK_CUR: /* seek to current position + offset */ + if (current_pos + offset < 0) { + /* offset is negative and will result in negative position */ + errno = EINVAL; + return (off_t)(-1); + } current_pos += offset; break; case SEEK_END: /* seek to EOF + offset */ - current_pos = meta->size + offset; + logical_eof = unifyfs_fid_logical_size(fid); + LOGDBG("fid=%d EOF is offset %zu", fid, (size_t)logical_eof); + if (logical_eof + offset < 0) { + /* offset is negative and will result in negative position */ + errno = EINVAL; + return (off_t)(-1); + } + current_pos = logical_eof + offset; + break; + case SEEK_DATA: + /* Using fallback approach: always return offset */ + logical_eof = unifyfs_fid_logical_size(fid); + LOGDBG("fid=%d EOF is offset %zu", fid, (size_t)logical_eof); + if (offset < 0 || offset > logical_eof) { + /* negative offset and offset beyond EOF are invalid */ + errno = ENXIO; + return (off_t)(-1); + } + current_pos = offset; + break; + case SEEK_HOLE: + /* Using fallback approach: always return offset for EOF */ + logical_eof = unifyfs_fid_logical_size(fid); + LOGDBG("fid=%d EOF is offset %zu", fid, (size_t)logical_eof); + if (offset < 0 || offset > logical_eof) { + /* negative offset and offset beyond EOF are invalid */ + errno = ENXIO; + return (off_t)(-1); + } + current_pos = logical_eof; break; default: errno = EINVAL; return (off_t)(-1); } - LOGDBG("seeking to %ld", current_pos); /* set and return final file position */ filedesc->pos = current_pos; @@ -907,9 +1313,9 @@ off64_t UNIFYFS_WRAP(lseek64)(int fd, off64_t offset, int whence) if (unifyfs_intercept_fd(&fd)) { if (sizeof(off_t) == sizeof(off64_t)) { /* off_t and off64_t are the same size, - * delegate to lseek warpper */ + * delegate to lseek wrapper */ off64_t ret = (off64_t)UNIFYFS_WRAP(lseek)( - origfd, (off_t) offset, whence); + origfd, (off_t) offset, whence); return ret; } else { /* ERROR: fn not yet supported */ @@ -925,6 +1331,7 @@ off64_t UNIFYFS_WRAP(lseek64)(int fd, off64_t offset, int whence) } } +#ifdef HAVE_POSIX_FADVISE int UNIFYFS_WRAP(posix_fadvise)(int fd, off_t offset, off_t len, int advice) { /* check whether we should intercept this file descriptor */ @@ -973,6 +1380,7 @@ int UNIFYFS_WRAP(posix_fadvise)(int fd, off_t offset, off_t len, int advice) return ret; } } +#endif ssize_t UNIFYFS_WRAP(read)(int fd, void* buf, size_t count) { @@ -994,54 +1402,20 @@ ssize_t UNIFYFS_WRAP(read)(int fd, void* buf, size_t count) return (ssize_t)(-1); } -#if 0 // THIS IS BROKEN UNTIL WE HAVE GLOBAL SIZE - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - if (meta == NULL) { - /* ERROR: invalid file descriptor */ - errno = EBADF; + /* execute read */ + size_t bytes; + int read_rc = unifyfs_fd_read(fd, 
filedesc->pos, buf, count, &bytes); + if (read_rc != UNIFYFS_SUCCESS) { + /* read operation failed */ + errno = unifyfs_rc_errno(read_rc); return (ssize_t)(-1); } - /* check for end of file */ - if (filedesc->pos >= meta->size) { - return 0; /* EOF */ - } -#endif - - /* assume we'll succeed in read */ - size_t retcount = count; - - read_req_t tmp_req; - tmp_req.fid = fid; - tmp_req.offset = (size_t) filedesc->pos; - tmp_req.length = count; - tmp_req.errcode = UNIFYFS_SUCCESS; - tmp_req.buf = buf; - - /* - * this returns error code, which is zero for successful cases. - */ - int ret = unifyfs_fd_logreadlist(&tmp_req, 1); - - /* - * FIXME: when we can get the global file size correctly, the following - * should be rewritten. currently, we cannot detect EOF reliably. - */ - if (ret != UNIFYFS_SUCCESS) { - if (tmp_req.errcode != UNIFYFS_SUCCESS) { - /* error reading data */ - errno = EIO; - retcount = -1; - } else { - retcount = 0; /* possible EOF */ - } - } else { - /* success, update position */ - filedesc->pos += (off_t) retcount; - } + /* success, update file pointer position */ + filedesc->pos += (off_t)bytes; /* return number of bytes read */ - return (ssize_t) retcount; + return (ssize_t)bytes; } else { MAP_OR_FAIL(read); ssize_t ret = UNIFYFS_REAL(read)(fd, buf, count); @@ -1052,8 +1426,6 @@ ssize_t UNIFYFS_WRAP(read)(int fd, void* buf, size_t count) /* TODO: find right place to msync spillover mapping */ ssize_t UNIFYFS_WRAP(write)(int fd, const void* buf, size_t count) { - ssize_t ret; - LOGDBG("write was called for fd %d", fd); /* check whether we should intercept this file descriptor */ @@ -1062,882 +1434,213 @@ ssize_t UNIFYFS_WRAP(write)(int fd, const void* buf, size_t count) unifyfs_fd_t* filedesc = unifyfs_get_filedesc_from_fd(fd); if (filedesc == NULL) { /* ERROR: invalid file descriptor */ - errno = EBADF; - return (ssize_t)(-1); - } - - /* write data to file */ - int write_rc = unifyfs_fd_write(fd, filedesc->pos, buf, count); - if (write_rc != UNIFYFS_SUCCESS) { - errno = unifyfs_err_map_to_errno(write_rc); - return (ssize_t)(-1); - } - ret = count; - - /* update file position */ - filedesc->pos += ret; - } else { - MAP_OR_FAIL(write); - ret = UNIFYFS_REAL(write)(fd, buf, count); - } - - return ret; -} - -ssize_t UNIFYFS_WRAP(readv)(int fd, const struct iovec* iov, int iovcnt) -{ - ssize_t ret; - - /* check whether we should intercept this file descriptor */ - if (unifyfs_intercept_fd(&fd)) { - ssize_t rret; - int i; - ret = 0; - for (i = 0; i < iovcnt; i++) { - rret = UNIFYFS_WRAP(read)(fd, (void*)iov[i].iov_base, - iov[i].iov_len); - if (-1 == rret) { - return -1; - } else if (0 == rret) { - return ret; - } else { - ret += rret; - } - } - return ret; - } else { - MAP_OR_FAIL(readv); - ret = UNIFYFS_REAL(readv)(fd, iov, iovcnt); - return ret; - } -} - -ssize_t UNIFYFS_WRAP(writev)(int fd, const struct iovec* iov, int iovcnt) -{ - ssize_t ret; - - /* check whether we should intercept this file descriptor */ - if (unifyfs_intercept_fd(&fd)) { - ssize_t wret; - int i; - ret = 0; - for (i = 0; i < iovcnt; i++) { - wret = UNIFYFS_WRAP(write)(fd, (const void*)iov[i].iov_base, - iov[i].iov_len); - if (-1 == wret) { - return -1; - } else { - ret += wret; - if ((size_t)wret != iov[i].iov_len) { - return ret; - } - } - } - return ret; - } else { - MAP_OR_FAIL(writev); - ret = UNIFYFS_REAL(writev)(fd, iov, iovcnt); - return ret; - } -} - -int UNIFYFS_WRAP(lio_listio)(int mode, struct aiocb* const aiocb_list[], - int nitems, struct sigevent* sevp) -{ - /* TODO - support for LIO_NOWAIT 
mode */ - - read_req_t* reqs = calloc(nitems, sizeof(read_req_t)); - if (NULL == reqs) { - errno = ENOMEM; // EAGAIN? - return -1; - } - - int ret = 0; - int reqcnt = 0; - int i, fd, fid, ndx, rc; - struct aiocb* cbp; - - for (i = 0; i < nitems; i++) { - cbp = aiocb_list[i]; - fd = cbp->aio_fildes; - switch (cbp->aio_lio_opcode) { - case LIO_WRITE: { - ssize_t wret; - wret = UNIFYFS_WRAP(pwrite)(fd, (const void*)cbp->aio_buf, - cbp->aio_nbytes, cbp->aio_offset); - if (-1 == wret) { - AIOCB_ERROR_CODE(cbp) = errno; - } else { - AIOCB_ERROR_CODE(cbp) = 0; - AIOCB_RETURN_VAL(cbp) = wret; - } - break; - } - case LIO_READ: { - if (unifyfs_intercept_fd(&fd)) { - /* get local file id for this request */ - fid = unifyfs_get_fid_from_fd(fd); - if (fid < 0) { - AIOCB_ERROR_CODE(cbp) = EINVAL; - } else { - reqs[reqcnt].fid = fid; - reqs[reqcnt].offset = (size_t)(cbp->aio_offset); - reqs[reqcnt].length = cbp->aio_nbytes; - reqs[reqcnt].errcode = EINPROGRESS; - reqs[reqcnt].buf = (char*)(cbp->aio_buf); - reqcnt++; - } - } else { - ssize_t rret; - rret = UNIFYFS_WRAP(pread)(fd, (void*)cbp->aio_buf, - cbp->aio_nbytes, cbp->aio_offset); - if (-1 == rret) { - AIOCB_ERROR_CODE(cbp) = errno; - } else { - AIOCB_ERROR_CODE(cbp) = 0; - AIOCB_RETURN_VAL(cbp) = rret; - } - } - break; - } - default: // LIO_NOP - break; - } - } - - if (reqcnt) { - rc = unifyfs_fd_logreadlist(reqs, reqcnt); - if (rc != UNIFYFS_SUCCESS) { - /* error reading data */ - ret = -1; - } - /* update aiocb fields to record error status and return value */ - ndx = 0; - for (i = 0; i < reqcnt; i++) { - char* buf = reqs[i].buf; - for (; ndx < nitems; ndx++) { - cbp = aiocb_list[ndx]; - if ((char*)(cbp->aio_buf) == buf) { - AIOCB_ERROR_CODE(cbp) = reqs[i].errcode; - if (0 == reqs[i].errcode) { - AIOCB_RETURN_VAL(cbp) = reqs[i].length; - } - break; // continue outer loop - } - } - } - } - - free(reqs); - - if (-1 == ret) { - errno = EIO; - } - return ret; -} - -/* order by file id then by file position */ -static int compare_index_entry(const void* a, const void* b) -{ - const unifyfs_index_t* ptr_a = a; - const unifyfs_index_t* ptr_b = b; - - if (ptr_a->fid != ptr_b->fid) { - if (ptr_a->fid < ptr_b->fid) { - return -1; - } else { - return 1; - } - } - - if (ptr_a->file_pos == ptr_b->file_pos) { - return 0; - } else if (ptr_a->file_pos < ptr_b->file_pos) { - return -1; - } else { - return 1; - } -} - -/* order by file id then by offset */ -static int compare_read_req(const void* a, const void* b) -{ - const read_req_t* ptr_a = a; - const read_req_t* ptr_b = b; - - if (ptr_a->fid != ptr_b->fid) { - if (ptr_a->fid < ptr_b->fid) { - return -1; - } else { - return 1; - } - } - - if (ptr_a->offset == ptr_b->offset) { - return 0; - } else if (ptr_a->offset < ptr_b->offset) { - return -1; - } else { - return 1; - } -} - -/* returns index into read_req of item whose offset is - * just below offset of target item (if one exists) */ -static int unifyfs_locate_req(read_req_t* read_reqs, int count, - read_req_t* match_req) -{ - /* if list is empty, indicate that there is valid starting request */ - if (count == 0) { - return -1; - } - - /* if we only have one item, return its index */ - if (count == 1) { - return 0; - } - - /* if we have two items, return index to item that must come before */ - if (count == 2) { - if (compare_read_req(match_req, &read_reqs[1]) < 0) { - /* second item is clearly bigger, so try first */ - return 0; - } - - /* second item is less than or equal to target */ - return 1; - } - - /* execute binary search comparing target to list 
of requests */ - - int left = 0; - int right = count - 1; - int mid = (left + right) / 2; - - /* binary search until we find an exact match or have cut the list - * to just two items */ - int cmp; - while ((left + 1) < right) { - cmp = compare_read_req(match_req, &read_reqs[mid]); - if (cmp == 0) { - /* found exact match */ - return mid; - } else if (cmp > 0) { - /* if target if bigger than mid, set left bound to mid */ - left = mid; - } else { - /* if target is smaller than mid, set right bounds to mid */ - right = mid; - } - - /* update middle index */ - mid = (left + right) / 2; - } - - /* got two items, let's pick one */ - if (compare_read_req(match_req, &read_reqs[left]) < 0) { - /* target is smaller than left item, - * return index to left of left item if we can */ - if (left == 0) { - /* at left most item, so return this index */ - return 0; - } - return left - 1; - } else if (compare_read_req(match_req, &read_reqs[right]) < 0) { - /* target is smaller than right item, - * return index of item one less than right */ - return right - 1; - } else { - /* target is greater or equal to right item */ - return right; - } -} - -/* - * given an read request, split it into multiple indices whose range - * is equal or smaller than slice_range size - * @param cur_read_req: the read request to split - * @param slice_range: the slice size of the key-value store - * @return out_set: the set of split read requests - * */ -static int unifyfs_split_read_requests(read_req_t* req, - read_req_set_t* out_set, - size_t slice_range) -{ - /* compute offset of first and last byte in request */ - size_t req_start = req->offset; - size_t req_end = req->offset + req->length - 1; - - /* compute offset of first and last byte of slice - * that contains first byte of request */ - size_t slice_start = (req->offset / slice_range) * slice_range; - size_t slice_end = slice_start + slice_range - 1; - - /* initialize request count in output set */ - memset(out_set, 0, sizeof(read_req_set_t)); - int count = 0; - - if (req_end <= slice_end) { - /* slice fully contains request - * - * slice_start slice_end - * req_start req_end - * - */ - out_set->read_reqs[count] = *req; - count++; - } else { - /* read request spans multiple slices - * - * slice_start slice_end next_slice_start next_slice_end - * req_start req_end - * - */ - - /* account for leading bytes in read request in first slice */ - out_set->read_reqs[count] = *req; - out_set->read_reqs[count].length = slice_end - req_start + 1; - count++; - - /* account for all middle slices */ - do { - /* advance to next slice */ - slice_start = slice_end + 1; - slice_end = slice_start + slice_range - 1; - - if (req_end <= slice_end) { - /* found the slice that contains end byte in read request */ - break; - } - - /* full slice is contained in read request */ - out_set->read_reqs[count].fid = req->fid; - out_set->read_reqs[count].offset = slice_start; - out_set->read_reqs[count].length = slice_range; - out_set->read_reqs[count].errcode = UNIFYFS_SUCCESS; - count++; - } while (1); - - /* account for bytes in final slice */ - out_set->read_reqs[count].fid = req->fid; - out_set->read_reqs[count].offset = slice_start; - out_set->read_reqs[count].length = req_end - slice_start + 1; - out_set->read_reqs[count].errcode = UNIFYFS_SUCCESS; - count++; - } - - /* set size of output set */ - out_set->count = count; - - return 0; -} - -/* - * coalesce read requests referring to contiguous data within a given - * file id, and split read requests whose size is larger than - * 
unifyfs_key_slice_range into more requests that are smaller - * - * Note: a series of read requests that have overlapping spans - * will prevent merging of contiguous ranges, this should still - * function, but performance may be lost - * - * @param read_req: a list of read requests - * @param count: number of read requests - * @param tmp_set: a temporary read requests buffer - * to hold the intermediate result - * @param unifyfs_key_slice_range: slice size of distributed - * key-value store - * @return out_set: the coalesced read requests - * - * */ -static int unifyfs_coalesce_read_reqs(read_req_t* read_req, int count, - size_t slice_range, - read_req_set_t* out_set) -{ - read_req_set_t tmp_set; - - /* initialize output and temporary sets */ - out_set->count = 0; - memset(&tmp_set, 0, sizeof(tmp_set)); - - int i; - int out_idx = 0; - for (i = 0; i < count; i++) { - /* index into temp set */ - int tmp_idx = 0; - - /* split this read request into parts based on slice range - * store resulting requests in tmp_set */ - unifyfs_split_read_requests(&read_req[i], &tmp_set, slice_range); - - /* look to merge last item in output set with first item - * in split requests */ - if (out_idx > 0) { - /* get pointer to last item in out_set */ - read_req_t* out_req = &(out_set->read_reqs[out_idx - 1]); - - /* get pointer to first item in tmp_set */ - read_req_t* tmp_req = &(tmp_set.read_reqs[0]); - - /* look to merge these items if they are contiguous */ - if (out_req->fid == tmp_req->fid && - out_req->offset + out_req->length == tmp_req->offset) { - /* refers to contiguous range in the same file, - * coalesce if also in the same slice */ - uint64_t cur_slice = out_req->offset / slice_range; - uint64_t tmp_slice = tmp_req->offset / slice_range; - if (cur_slice == tmp_slice) { - /* just increase length to merge */ - out_req->length += tmp_req->length; - - /* bump offset into tmp set array */ - tmp_idx++; - } - } - } - - /* tack on remaining items from tmp set into output set */ - for (; tmp_idx < tmp_set.count; tmp_idx++) { - out_set->read_reqs[out_idx] = tmp_set.read_reqs[tmp_idx]; - out_set->count++; - out_idx++; - } - } - - return 0; -} - -/* - * match the received read_requests with the - * client's read requests - * @param read_reqs: a list of read requests - * @param count: number of read requests - * @param match_req: received read request to match - * @return error code - * - * */ -static int unifyfs_match_received_ack(read_req_t* read_reqs, int count, - read_req_t* match_req) -{ - /* given fid, offset, and length of match_req that holds read reply, - * identify which read request this belongs to in read_req array, - * then copy data to user buffer */ - - /* create a request corresponding to the first byte in read reply */ - read_req_t match_start = *match_req; - - /* create a request corresponding to last byte in read reply */ - read_req_t match_end = *match_req; - match_end.offset += match_end.length - 1; - - /* find index of read request that contains our first byte */ - int start_pos = unifyfs_locate_req(read_reqs, count, &match_start); - - /* find index of read request that contains our last byte */ - int end_pos = unifyfs_locate_req(read_reqs, count, &match_end); - - /* could not find a valid read request in read_req array */ - if (start_pos == -1) { - return UNIFYFS_FAILURE; - } - - /* s: start of match_req, e: end of match_req */ - - if (start_pos == 0) { - if (compare_read_req(&match_start, &read_reqs[0]) < 0) { - /* starting offset in read reply comes before lowest - * offset in read 
requests, consider this to be an error - * - * ************ *********** ************* - * s - * - * */ - return UNIFYFS_FAILURE; - } - } - - /* create read request corresponding to first byte of first read request */ - read_req_t first_start = read_reqs[start_pos]; - - /* create read request corresponding to last byte of first read request */ - read_req_t first_end = read_reqs[start_pos]; - first_end.offset += first_end.length - 1; - - /* check whether read reply is fully contained by first read request */ - if (compare_read_req(&match_start, &first_start) >= 0 && - compare_read_req(&match_end, &first_end) <= 0) { - /* read reply is fully contained within first read request - * - * first_s first_e - * ***************** ************* - * s e - * - * */ - - /* copy data to user buffer if no error */ - if (match_req->errcode == UNIFYFS_SUCCESS) { - /* compute buffer location to copy data */ - size_t offset = (size_t)(match_start.offset - first_start.offset); - char* buf = first_start.buf + offset; - - /* copy data to user buffer */ - memcpy(buf, match_req->buf, match_req->length); - - return UNIFYFS_SUCCESS; - } else { - /* hit an error during read, so record this fact - * in user's original read request */ - read_reqs[start_pos].errcode = match_req->errcode; - return UNIFYFS_FAILURE; - } - } - - /* define read request for offset of first byte in last read request */ - read_req_t last_start = read_reqs[end_pos]; - - /* define read request for offset of last byte in last read request */ - read_req_t last_end = read_reqs[end_pos]; - last_end.offset += last_end.length - 1; - - /* determine whether read reply is contained in a range of read requests */ - if (compare_read_req(&match_start, &first_start) >= 0 && - compare_read_req(&match_end, &last_end) <= 0) { - /* read reply spans multiple read requests - * - * first_s first_e req_s req_e req_s req_e last_s last_e - * ***************** *********** *********** **************** - * s e - * - * */ - - /* check that read requests from start_pos to end_pos - * define a contiguous set of bytes */ - int i; - for (i = start_pos + 1; i <= end_pos; i++) { - if ((read_reqs[i - 1].offset + read_reqs[i - 1].length) - != read_reqs[i].offset) { - /* read requests are noncontiguous, error */ - return UNIFYFS_FAILURE; - } + errno = EBADF; + return (ssize_t)(-1); } - /* read requests are contiguous, fill all buffers in middle */ - if (match_req->errcode == UNIFYFS_SUCCESS) { - /* get pointer to start of read reply data */ - char* ptr = match_req->buf; - - /* compute position in user buffer to copy data */ - size_t offset = (size_t)(match_start.offset - first_start.offset); - char* buf = first_start.buf + offset; - - /* compute number of bytes to copy into first read request */ - size_t length = - (size_t)(first_end.offset - match_start.offset + 1); + /* compute starting position to write within file, + * assume at current position on file descriptor */ + off_t pos = filedesc->pos; + if (filedesc->append) { + /* + * With O_APPEND we always write to the end, despite the current + * file position. 
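+             * The append flag itself is captured from the open() flags in
+             * filedesc->append, and the end-of-file offset is taken from
+             * unifyfs_fid_logical_size() just below.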
+ */ + int fid = unifyfs_get_fid_from_fd(fd); + pos = unifyfs_fid_logical_size(fid); + } - /* copy data into user buffer for first read request */ - memcpy(buf, ptr, length); - ptr += length; + /* write data to file */ + size_t bytes; + int write_rc = unifyfs_fd_write(fd, pos, buf, count, &bytes); + if (write_rc != UNIFYFS_SUCCESS) { + /* write failed */ + errno = unifyfs_rc_errno(write_rc); + return (ssize_t)(-1); + } - /* copy data for middle read requests */ - for (i = start_pos + 1; i < end_pos; i++) { - memcpy(read_reqs[i].buf, ptr, read_reqs[i].length); - ptr += read_reqs[i].length; - } + /* update file position */ + filedesc->pos = pos + bytes; - /* compute bytes for last read request */ - length = (size_t)(match_end.offset - last_start.offset + 1); + /* return number of bytes written */ + return (ssize_t)bytes; + } else { + MAP_OR_FAIL(write); + ssize_t ret = UNIFYFS_REAL(write)(fd, buf, count); + return ret; + } +} - /* copy data into user buffer for last read request */ - memcpy(last_start.buf, ptr, length); - ptr += length; +ssize_t UNIFYFS_WRAP(readv)(int fd, const struct iovec* iov, int iovcnt) +{ + ssize_t ret; - return UNIFYFS_SUCCESS; - } else { - /* hit an error during read, update errcode in user's - * original read request from start to end inclusive */ - for (i = start_pos; i <= end_pos; i++) { - read_reqs[i].errcode = match_req->errcode; + /* check whether we should intercept this file descriptor */ + int origfd = fd; + if (unifyfs_intercept_fd(&fd)) { + ssize_t rret; + int i; + ret = 0; + for (i = 0; i < iovcnt; i++) { + rret = UNIFYFS_WRAP(read)(origfd, (void*)iov[i].iov_base, + iov[i].iov_len); + if (-1 == rret) { + return -1; + } else if (0 == rret) { + return ret; + } else { + ret += rret; } - return UNIFYFS_FAILURE; } + return ret; + } else { + MAP_OR_FAIL(readv); + ret = UNIFYFS_REAL(readv)(fd, iov, iovcnt); + return ret; } - - /* could not find a matching read request, return an error */ - return UNIFYFS_FAILURE; } -/* notify our delegator that the shared memory buffer - * is now clear and ready to hold more read data */ -static void delegator_signal(void) +ssize_t UNIFYFS_WRAP(writev)(int fd, const struct iovec* iov, int iovcnt) { - LOGDBG("receive buffer now empty"); - - /* set shm flag to signal delegator we're done */ - shm_header_t* hdr = (shm_header_t*)shm_recv_buf; - hdr->state = SHMEM_REGION_EMPTY; - - /* TODO: MEM_FLUSH */ -} + ssize_t ret; -/* wait for delegator to inform us that shared memory buffer - * is filled with read data */ -static int delegator_wait(void) -{ - int rc = (int)UNIFYFS_SUCCESS; - -#if defined(UNIFYFS_USE_DOMAIN_SOCKET) - /* wait for signal on socket */ - cmd_fd.events = POLLIN | POLLPRI; - cmd_fd.revents = 0; - rc = poll(&cmd_fd, 1, -1); - - /* check that we got something good */ - if (rc == 0) { - if (cmd_fd.revents != 0) { - if (cmd_fd.revents == POLLIN) { - return UNIFYFS_SUCCESS; + /* check whether we should intercept this file descriptor */ + int origfd = fd; + if (unifyfs_intercept_fd(&fd)) { + ssize_t wret; + int i; + ret = 0; + for (i = 0; i < iovcnt; i++) { + wret = UNIFYFS_WRAP(write)(origfd, (const void*)iov[i].iov_base, + iov[i].iov_len); + if (-1 == wret) { + return -1; } else { - printf("poll returned %d; error: %s\n", rc, strerror(errno)); + ret += wret; + if ((size_t)wret != iov[i].iov_len) { + return ret; + } } - } else { - printf("poll returned %d; error: %s\n", rc, strerror(errno)); } + return ret; } else { - printf("poll returned %d; error: %s\n", rc, strerror(errno)); - } -#endif - - /* specify time to sleep 
between checking flag in shared - * memory indicating server has produced */ - struct timespec shm_wait_tm; - shm_wait_tm.tv_sec = 0; - shm_wait_tm.tv_nsec = SHM_WAIT_INTERVAL; - - /* get pointer to flag in shared memory */ - shm_header_t* hdr = (shm_header_t*)shm_recv_buf; - - /* wait for server to set flag to non-zero */ - int max_sleep = 5000000; // 5s - volatile int* vip = (volatile int*)&(hdr->state); - while (*vip == SHMEM_REGION_EMPTY) { - /* not there yet, sleep for a while */ - nanosleep(&shm_wait_tm, NULL); - /* TODO: MEM_FETCH */ - max_sleep--; - if (0 == max_sleep) { - LOGERR("timed out waiting for non-empty"); - rc = (int)UNIFYFS_ERROR_SHMEM; - break; - } + MAP_OR_FAIL(writev); + ret = UNIFYFS_REAL(writev)(fd, iov, iovcnt); + return ret; } - - return rc; } -/* copy read data from shared memory buffer to user buffers from read - * calls, sets done=1 on return when delegator informs us it has no - * more data */ -static int process_read_data(read_req_t* read_reqs, int count, int* done) +#ifdef HAVE_LIO_LISTIO +int UNIFYFS_WRAP(lio_listio)(int mode, struct aiocb* const aiocb_list[], + int nitems, struct sigevent* sevp) { - /* assume we'll succeed */ - int rc = UNIFYFS_SUCCESS; - - /* get pointer to start of shared memory buffer */ - shm_header_t* shm_hdr = (shm_header_t*)shm_recv_buf; - char* shmptr = ((char*)shm_hdr) + sizeof(shm_header_t); + /* TODO - support for LIO_NOWAIT mode */ - size_t num = shm_hdr->meta_cnt; - if (0 == num) { - LOGDBG("no read responses available"); - return rc; + read_req_t* reqs = calloc(nitems, sizeof(read_req_t)); + if (NULL == reqs) { + errno = ENOMEM; // EAGAIN? + return -1; } - /* process each of our replies */ - size_t i; - for (i = 0; i < num; i++) { - /* get pointer to current read reply header */ - shm_meta_t* msg = (shm_meta_t*)shmptr; - shmptr += sizeof(shm_meta_t); - - /* define request object */ - read_req_t req; - req.fid = msg->gfid; - req.offset = msg->offset; - req.length = msg->length; - req.errcode = msg->errcode; + int ret = 0; + int reqcnt = 0; + int i, fd, fid, ndx, rc; + struct aiocb* cbp; - LOGDBG("read reply: gfid=%d offset=%zu size=%zu", - req.fid, req.offset, req.length); + for (i = 0; i < nitems; i++) { + cbp = aiocb_list[i]; + fd = cbp->aio_fildes; - /* get pointer to data */ - req.buf = shmptr; - shmptr += msg->length; + /* LOGDBG("aiocb(fd=%d, op=%d, count=%zu, offset=%zu, buf=%p)", + * cbp->aio_fildes, cbp->aio_lio_opcode, cbp->aio_nbytes, + * cbp->aio_offset, cbp->aio_buf); */ - /* process this read reply, identify which application read - * request this reply goes to and copy data to user buffer */ - int tmp_rc = unifyfs_match_received_ack(read_reqs, count, &req); - if (tmp_rc != UNIFYFS_SUCCESS) { - rc = UNIFYFS_FAILURE; + switch (cbp->aio_lio_opcode) { + case LIO_WRITE: { + ssize_t wret; + wret = UNIFYFS_WRAP(pwrite)(fd, (const void*)cbp->aio_buf, + cbp->aio_nbytes, cbp->aio_offset); + if (-1 == wret) { + AIOCB_ERROR_CODE(cbp) = errno; + } else { + AIOCB_ERROR_CODE(cbp) = 0; + AIOCB_RETURN_VAL(cbp) = wret; + } + break; } - } - - /* set done flag if there is no more data */ - if (shm_hdr->state == SHMEM_REGION_DATA_COMPLETE) { - *done = 1; - } - - return rc; -} + case LIO_READ: { + if (unifyfs_intercept_fd(&fd)) { + /* get local file id for this request */ + fid = unifyfs_get_fid_from_fd(fd); + if (fid < 0) { + AIOCB_ERROR_CODE(cbp) = EINVAL; + } else { + /* TODO: handle error if sync fails? 
*/ + /* sync data for file before reading, if needed */ + unifyfs_fid_sync(fid); -/* - * get data for a list of read requests from the - * delegator - * @param read_reqs: a list of read requests - * @param count: number of read requests - * @return error code - * - * */ -int unifyfs_fd_logreadlist(read_req_t* read_reqs, int count) -{ - int i; - int tot_sz = 0; - int rc = UNIFYFS_SUCCESS; - int num = 0; - int* ptr_size = NULL; - int* ptr_num = NULL; - -#if 0 /* TODO: when meta has correct file size, we can use this code */ - /* Adjust length for fitting the EOF. */ - for (i = 0; i < count; i++) { - /* get pointer to read request */ - read_req_t* req = &read_reqs[i]; - - /* get metadata for this file */ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(req->fid); - if (meta == NULL) { - return UNIFYFS_ERROR_BADF; + /* define read request for this file */ + reqs[reqcnt].gfid = unifyfs_gfid_from_fid(fid); + reqs[reqcnt].offset = (size_t)(cbp->aio_offset); + reqs[reqcnt].length = cbp->aio_nbytes; + reqs[reqcnt].nread = 0; + reqs[reqcnt].errcode = EINPROGRESS; + reqs[reqcnt].buf = (char*)(cbp->aio_buf); + reqs[reqcnt].aiocbp = cbp; + reqcnt++; + } + } else { + ssize_t rret; + rret = UNIFYFS_WRAP(pread)(fd, (void*)cbp->aio_buf, + cbp->aio_nbytes, cbp->aio_offset); + if (-1 == rret) { + AIOCB_ERROR_CODE(cbp) = errno; + } else { + AIOCB_ERROR_CODE(cbp) = 0; + AIOCB_RETURN_VAL(cbp) = rret; + } + } + break; } - - /* compute last byte of read request */ - size_t last_offset = req->offset + req->length; - if (last_offset > meta->size) { - /* shorten the request to read just up to end */ - req->length = meta->size - req->offset; + default: // LIO_NOP + LOGDBG("lio_vec[%d] - unexpected LIO op %d", + i, cbp->aio_lio_opcode); + break; } } -#endif - /* - * Todo: When the number of read requests exceed the - * request buffer, split list io into multiple bulk - * sends and transfer in bulks - * */ - - /* convert local fid to global fid */ - unifyfs_file_attr_t tmp_meta_entry; - unifyfs_file_attr_t* ptr_meta_entry; - for (i = 0; i < count; i++) { - /* look for global meta data for this local file id */ - tmp_meta_entry.fid = read_reqs[i].fid; - ptr_meta_entry = - (unifyfs_file_attr_t*) bsearch(&tmp_meta_entry, - unifyfs_fattrs.meta_entry, - *unifyfs_fattrs.ptr_num_entries, - sizeof(unifyfs_file_attr_t), - compare_fattr); - - /* replace local file id with global file id in request */ - if (ptr_meta_entry != NULL) { - read_reqs[i].fid = ptr_meta_entry->gfid; - } else { - /* failed to find gfid for this request */ - return UNIFYFS_ERROR_BADF; + if (reqcnt) { + rc = unifyfs_gfid_read_reqs(reqs, reqcnt); + if (rc != UNIFYFS_SUCCESS) { + /* error reading data */ + ret = rc; } - } - - /* order read request by increasing file id, then increasing offset */ - qsort(read_reqs, count, sizeof(read_req_t), compare_read_req); - /* coalesce the contiguous read requests */ - read_req_set_t read_req_set; - unifyfs_coalesce_read_reqs(read_reqs, count, - unifyfs_key_slice_range, - &read_req_set); - - /* prepare our shared memory buffer for delegator */ - delegator_signal(); - - int read_rc; - if (read_req_set.count > 1) { - /* got multiple read requests, - * build up a flat buffer to include them all */ - flatcc_builder_t builder; - flatcc_builder_init(&builder); - - /* create request vector */ - unifyfs_Extent_vec_start(&builder); - - /* fill in values for each request entry */ - for (i = 0; i < read_req_set.count; i++) { - unifyfs_Extent_vec_push_create(&builder, - read_req_set.read_reqs[i].fid, - 
read_req_set.read_reqs[i].offset, - read_req_set.read_reqs[i].length); + /* update aiocb fields to record error status and return value */ + for (i = 0; i < reqcnt; i++) { + for (ndx = 0; ndx < nitems; ndx++) { + cbp = aiocb_list[ndx]; + if (cbp == reqs[i].aiocbp) { + AIOCB_ERROR_CODE(cbp) = + unifyfs_rc_errno((unifyfs_rc)(reqs[i].errcode)); + if (0 == reqs[i].errcode) { + AIOCB_RETURN_VAL(cbp) = reqs[i].length; + } + break; // continue outer loop + } + } } - - /* complete the array */ - unifyfs_Extent_vec_ref_t extents = unifyfs_Extent_vec_end(&builder); - unifyfs_ReadRequest_create_as_root(&builder, extents); - //unifyfs_ReadRequest_end_as_root(&builder); - - /* allocate our buffer to be sent */ - size_t size = 0; - void* buffer = flatcc_builder_finalize_buffer(&builder, &size); - assert(buffer); - LOGDBG("mread: n_reqs:%d, flatcc buffer (%p) sz:%zu", - read_req_set.count, buffer, size); - - /* invoke read rpc here */ - read_rc = invoke_client_mread_rpc(read_req_set.count, size, buffer); - - flatcc_builder_clear(&builder); - free(buffer); - } else { - /* got a single read request */ - int gfid = ptr_meta_entry->gfid; - size_t offset = read_req_set.read_reqs[0].offset; - size_t length = read_req_set.read_reqs[0].length; - LOGDBG("read: offset:%zu, len:%zu", offset, length); - read_rc = invoke_client_read_rpc(gfid, offset, length); } - /* bail out with error if we failed to even start the read */ - if (read_rc != UNIFYFS_SUCCESS) { - return read_rc; - } + free(reqs); - /* - * ToDo: Exception handling when some of the requests - * are missed - * */ - - int done = 0; - while (!done) { - int tmp_rc = delegator_wait(); - if (tmp_rc != UNIFYFS_SUCCESS) { - rc = UNIFYFS_FAILURE; - done = 1; - } else { - tmp_rc = process_read_data(read_reqs, count, &done); - if (tmp_rc != UNIFYFS_SUCCESS) { - rc = UNIFYFS_FAILURE; - } - delegator_signal(); - } + if (ret) { + errno = unifyfs_rc_errno(ret); + ret = -1; } - - return rc; + return ret; } +#endif ssize_t UNIFYFS_WRAP(pread)(int fd, void* buf, size_t count, off_t offset) { @@ -1954,44 +1657,37 @@ ssize_t UNIFYFS_WRAP(pread)(int fd, void* buf, size_t count, off_t offset) return (ssize_t)(-1); } -#if 0 // THIS IS BROKEN UNTIL WE HAVE GLOBAL SIZE - /* get pointer to file descriptor structure */ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - if (meta == NULL) { - /* ERROR: invalid file descriptor */ - errno = EBADF; - return (ssize_t)(-1); - } - - /* check for end of file */ - if (offset >= meta->size) { - return 0; - } -#endif - - /* assume we'll succeed in read */ - size_t retcount = count; - - read_req_t tmp_req; - tmp_req.fid = fid; - tmp_req.offset = offset; - tmp_req.length = count; - tmp_req.errcode = UNIFYFS_SUCCESS; - tmp_req.buf = buf; + /* TODO: handle error if sync fails? 
*/ + /* sync data for file before reading, if needed */ + unifyfs_fid_sync(fid); - int ret = unifyfs_fd_logreadlist(&tmp_req, 1); + /* fill in read request */ + read_req_t req; + req.gfid = unifyfs_gfid_from_fid(fid); + req.offset = offset; + req.length = count; + req.nread = 0; + req.errcode = EINPROGRESS; + req.buf = buf; + + /* execute read operation */ + ssize_t retcount; + int ret = unifyfs_gfid_read_reqs(&req, 1); if (ret != UNIFYFS_SUCCESS) { /* error reading data */ - errno = EIO; + errno = unifyfs_rc_errno(ret); retcount = -1; - } else if (tmp_req.errcode != UNIFYFS_SUCCESS) { + } else if (req.errcode != UNIFYFS_SUCCESS) { /* error reading data */ - errno = EIO; + errno = unifyfs_rc_errno((unifyfs_rc)req.errcode); retcount = -1; + } else { + /* read succeeded, get number of bytes from nread field */ + retcount = (ssize_t) req.nread; } /* return number of bytes read */ - return (ssize_t) retcount; + return retcount; } else { MAP_OR_FAIL(pread); ssize_t ret = UNIFYFS_REAL(pread)(fd, buf, count, offset); @@ -2002,8 +1698,9 @@ ssize_t UNIFYFS_WRAP(pread)(int fd, void* buf, size_t count, off_t offset) ssize_t UNIFYFS_WRAP(pread64)(int fd, void* buf, size_t count, off64_t offset) { /* check whether we should intercept this file descriptor */ + int origfd = fd; if (unifyfs_intercept_fd(&fd)) { - return UNIFYFS_WRAP(pread)(fd, buf, count, (off_t)offset); + return UNIFYFS_WRAP(pread)(origfd, buf, count, (off_t)offset); } else { MAP_OR_FAIL(pread64); ssize_t ret = UNIFYFS_REAL(pread64)(fd, buf, count, offset); @@ -2027,14 +1724,17 @@ ssize_t UNIFYFS_WRAP(pwrite)(int fd, const void* buf, size_t count, } /* write data to file */ - int write_rc = unifyfs_fd_write(fd, offset, buf, count); + size_t bytes; + LOGDBG("pwrite - fd=%d offset=%zu count=%zu", + fd, (size_t)offset, count); + int write_rc = unifyfs_fd_write(fd, offset, buf, count, &bytes); if (write_rc != UNIFYFS_SUCCESS) { - errno = unifyfs_err_map_to_errno(write_rc); + errno = unifyfs_rc_errno(write_rc); return (ssize_t)(-1); } - /* return number of bytes read */ - return (ssize_t) count; + /* return number of bytes written */ + return (ssize_t)bytes; } else { MAP_OR_FAIL(pwrite); ssize_t ret = UNIFYFS_REAL(pwrite)(fd, buf, count, offset); @@ -2046,8 +1746,9 @@ ssize_t UNIFYFS_WRAP(pwrite64)(int fd, const void* buf, size_t count, off64_t offset) { /* check whether we should intercept this file descriptor */ + int origfd = fd; if (unifyfs_intercept_fd(&fd)) { - return UNIFYFS_WRAP(pwrite)(fd, buf, count, (off_t)offset); + return UNIFYFS_WRAP(pwrite)(origfd, buf, count, (off_t)offset); } else { MAP_OR_FAIL(pwrite64); ssize_t ret = UNIFYFS_REAL(pwrite64)(fd, buf, count, offset); @@ -2055,6 +1756,65 @@ ssize_t UNIFYFS_WRAP(pwrite64)(int fd, const void* buf, size_t count, } } +int UNIFYFS_WRAP(fchdir)(int fd) +{ + /* determine whether we should intercept this path */ + if (unifyfs_intercept_fd(&fd)) { + /* lookup file id for file descriptor */ + int fid = unifyfs_get_fid_from_fd(fd); + if (fid < 0) { + errno = EBADF; + return -1; + } + + /* lookup path for fd */ + const char* path = unifyfs_path_from_fid(fid); + + /* TODO: test that path is not a file? */ + + /* we're happy to change into any directory in unifyfs + * should we check that we don't change into a file at least? 
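+         * For now the path associated with the fid simply becomes the new
+         * value of unifyfs_cwd below.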
*/ + if (unifyfs_cwd != NULL) { + free(unifyfs_cwd); + } + unifyfs_cwd = strdup(path); + return 0; + } else { + MAP_OR_FAIL(fchdir); + int ret = UNIFYFS_REAL(fchdir)(fd); + + /* if the change dir was successful, + * update our current working directory */ + if (unifyfs_initialized && ret == 0) { + if (unifyfs_cwd != NULL) { + free(unifyfs_cwd); + } + + /* if we did a real chdir, let's use a real getcwd + * to get the current working directory */ + MAP_OR_FAIL(getcwd); + char* cwd = UNIFYFS_REAL(getcwd)(NULL, 0); + if (cwd != NULL) { + unifyfs_cwd = cwd; + + /* parts of the code may assume unifyfs_cwd is a max size */ + size_t len = strlen(cwd) + 1; + if (len > UNIFYFS_MAX_FILENAME) { + LOGERR("Current working dir longer (%lu bytes) " + "than UNIFYFS_MAX_FILENAME=%d", + (unsigned long) len, UNIFYFS_MAX_FILENAME); + } + } else { + /* ERROR */ + LOGERR("Failed to getcwd after fchdir(%d) errno=%d %s", + fd, errno, strerror(errno)); + } + } + + return ret; + } +} + int UNIFYFS_WRAP(ftruncate)(int fd, off_t length) { /* check whether we should intercept this file descriptor */ @@ -2077,7 +1837,7 @@ int UNIFYFS_WRAP(ftruncate)(int fd, off_t length) /* truncate the file */ int rc = unifyfs_fid_truncate(fid, length); if (rc != UNIFYFS_SUCCESS) { - errno = EIO; + errno = unifyfs_rc_errno(rc); return -1; } @@ -2089,31 +1849,6 @@ int UNIFYFS_WRAP(ftruncate)(int fd, off_t length) } } -/* get the gfid for use in fsync wrapper - * TODO: maybe move this somewhere else */ -uint32_t get_gfid(int fid) -{ - unifyfs_file_attr_t target; - target.fid = fid; - - const void* entries = unifyfs_fattrs.meta_entry; - size_t num = *unifyfs_fattrs.ptr_num_entries; - size_t size = sizeof(unifyfs_file_attr_t); - - unifyfs_file_attr_t* entry = - (unifyfs_file_attr_t*) bsearch(&target, entries, num, size, - compare_fattr); - - uint32_t gfid; - if (entry != NULL) { - gfid = (uint32_t)entry->gfid; - } else { - return -1; - } - - return gfid; -} - int UNIFYFS_WRAP(fsync)(int fd) { /* check whether we should intercept this file descriptor */ @@ -2121,34 +1856,19 @@ int UNIFYFS_WRAP(fsync)(int fd) /* get the file id for this file descriptor */ int fid = unifyfs_get_fid_from_fd(fd); if (fid < 0) { + LOGERR("Couldn't get fid from fd %d", fd); errno = EBADF; return -1; } - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - if (!meta->needs_sync) { - return 0; - } - - /* if using spill over, fsync spillover data to disk */ - if (unifyfs_use_spillover) { - int ret = __real_fsync(unifyfs_spilloverblock); - if (ret != 0) { - /* error, need to set errno appropriately, - * we called the real fsync which should - * have already set errno to something reasonable */ - return -1; - } - } - - /* if using LOGIO, call fsync rpc */ - if (meta->storage == FILE_STORAGE_LOGIO) { - /* invoke fsync rpc to register index metadata with server */ - int gfid = get_gfid(fid); - invoke_client_fsync_rpc(gfid); + /* invoke fsync rpc to register index metadata with server */ + int ret = unifyfs_fid_sync(fid); + if (ret != UNIFYFS_SUCCESS) { + /* sync failed for some reason, set errno and return error */ + errno = unifyfs_rc_errno(ret); + return -1; } - meta->needs_sync = 0; return 0; } else { MAP_OR_FAIL(fsync); @@ -2319,8 +2039,11 @@ void* UNIFYFS_WRAP(mmap64)(void* addr, size_t length, int prot, int flags, int fd, off64_t offset) { /* check whether we should intercept this file descriptor */ + int origfd = fd; if (unifyfs_intercept_fd(&fd)) { - return UNIFYFS_WRAP(mmap)(addr, length, prot, flags, fd, (off_t)offset); + void* ret = 
UNIFYFS_WRAP(mmap)(addr, length, prot, flags, origfd, + (off_t)offset); + return ret; } else { MAP_OR_FAIL(mmap64); void* ret = UNIFYFS_REAL(mmap64)(addr, length, prot, flags, fd, offset); @@ -2331,7 +2054,6 @@ void* UNIFYFS_WRAP(mmap64)(void* addr, size_t length, int prot, int flags, int UNIFYFS_WRAP(close)(int fd) { /* check whether we should intercept this file descriptor */ - int origfd = fd; if (unifyfs_intercept_fd(&fd)) { LOGDBG("closing fd %d", fd); @@ -2351,15 +2073,19 @@ int UNIFYFS_WRAP(close)(int fd) return -1; } - /* if file was opened for writing, fsync it */ + /* if file was opened for writing, sync it */ if (filedesc->write) { - UNIFYFS_WRAP(fsync)(origfd); + int sync_rc = unifyfs_fid_sync(fid); + if (sync_rc != UNIFYFS_SUCCESS) { + errno = unifyfs_rc_errno(sync_rc); + return -1; + } } /* close the file id */ int close_rc = unifyfs_fid_close(fid); if (close_rc != UNIFYFS_SUCCESS) { - errno = EIO; + errno = unifyfs_rc_errno(close_rc); return -1; } @@ -2382,28 +2108,29 @@ int UNIFYFS_WRAP(close)(int fd) /* Helper function used by fchmod() and chmod() */ static int __chmod(int fid, mode_t mode) { - int gfid; - unifyfs_filemeta_t* meta; - const char* path; int ret; - path = unifyfs_path_from_fid(fid); + /* get path for printing debug messages */ + const char* path = unifyfs_path_from_fid(fid); - meta = unifyfs_get_meta_from_fid(fid); - if (!meta) { - LOGDBG("chmod: %s no metadata info", path); + /* lookup metadata for this file */ + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); + if (NULL == meta) { + LOGDBG("no metadata info for %s", path); errno = ENOENT; return -1; } /* Once a file is laminated, you can't modify it in any way */ if (meta->is_laminated) { - LOGDBG("chmod: %s is already laminated", path); + LOGDBG("%s is already laminated", path); errno = EROFS; return -1; } - gfid = unifyfs_generate_gfid(path); + /* found file, and it's not yet laminated, + * get the global file id */ + int gfid = unifyfs_gfid_from_fid(fid); /* * If the chmod clears all the existing write bits, then it's a laminate. @@ -2413,40 +2140,52 @@ static int __chmod(int fid, mode_t mode) */ if ((meta->mode & 0222) && (((meta->mode & 0222) & mode) == 0)) { - - /* - * We're laminating. Calculate the file size so we can cache it - * (both locally and on the server). - */ - ret = invoke_client_filesize_rpc(gfid, &meta->size); + /* We're laminating. 
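+         * Ask the server to laminate the file; the file size and the
+         * is_laminated flag are picked up again when the global metadata
+         * is read back below.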
*/ + ret = invoke_client_laminate_rpc(gfid); if (ret) { - LOGERR("chmod: couldn't get the global file size on laminate"); - errno = EIO; + LOGERR("laminate failed"); + errno = unifyfs_rc_errno(ret); return -1; } - meta->is_laminated = 1; } /* Clear out our old permission bits, and set the new ones in */ meta->mode = meta->mode & ~0777; meta->mode = meta->mode | mode; - ret = unifyfs_set_global_file_meta(fid, gfid); + /* update the global meta data to reflect new permissions */ + unifyfs_file_attr_op_e op = UNIFYFS_FILE_ATTR_OP_CHMOD; + ret = unifyfs_set_global_file_meta_from_fid(fid, op); if (ret) { LOGERR("chmod: can't set global meta entry for %s (fid:%d)", path, fid); - errno = EIO; + errno = unifyfs_rc_errno(ret); + return -1; + } + + /* read metadata back to pick up file size and laminated flag */ + unifyfs_file_attr_t attr = {0}; + ret = unifyfs_get_global_file_meta(gfid, &attr); + if (ret) { + LOGERR("chmod: can't get global meta entry for %s (fid:%d)", + path, fid); + errno = unifyfs_rc_errno(ret); return -1; } + + LOGDBG("attributes from global metadata"); + debug_print_file_attr(&attr); + + /* update global size of file from global metadata */ + unifyfs_fid_update_file_meta(fid, &attr); + return 0; } int UNIFYFS_WRAP(fchmod)(int fd, mode_t mode) { /* check whether we should intercept this file descriptor */ - int origfd = fd; if (unifyfs_intercept_fd(&fd)) { - /* TODO: what to do if underlying file has been deleted? */ /* check that fd is actually in use */ @@ -2455,9 +2194,9 @@ int UNIFYFS_WRAP(fchmod)(int fd, mode_t mode) errno = EBADF; return -1; } + LOGDBG("fchmod: setting fd %d to %o", fd, mode); return __chmod(fid, mode); - } else { MAP_OR_FAIL(fchmod); int ret = UNIFYFS_REAL(fchmod)(fd, mode); @@ -2465,24 +2204,21 @@ int UNIFYFS_WRAP(fchmod)(int fd, mode_t mode) } } - int UNIFYFS_WRAP(chmod)(const char* path, mode_t mode) { - int fid, gfid; - int ret; - unifyfs_filemeta_t* meta; /* determine whether we should intercept this path */ - if (unifyfs_intercept_path(path)) { + char upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(path, upath)) { /* check if path exists */ - fid = unifyfs_get_fid_from_path(path); + int fid = unifyfs_get_fid_from_path(upath); if (fid < 0) { LOGDBG("chmod: unifyfs_get_id_from path failed, returning -1, %s", - path); + upath); errno = ENOENT; return -1; } - LOGDBG("chmod: setting %s to %o", path, mode); + LOGDBG("chmod: setting %s to %o", upath, mode); return __chmod(fid, mode); } else { MAP_OR_FAIL(chmod); diff --git a/client/src/unifyfs-sysio.h b/client/src/unifyfs-sysio.h index d829874f8..a6e93939e 100644 --- a/client/src/unifyfs-sysio.h +++ b/client/src/unifyfs-sysio.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. 
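/*
 * Illustrative sketch, not part of the patch itself: the __chmod()/chmod()
 * wrappers in the unifyfs-sysio.c hunks above treat a mode change that clears
 * every write bit as a request to laminate the file. The snippet below shows
 * how an application might drive that path. The file name "/unifyfs/output.dat"
 * and the helper name write_and_laminate() are hypothetical, and the program
 * is assumed to run with the UnifyFS wrappers intercepting its POSIX calls.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

static int write_and_laminate(const char* path)
{
    /* create and write the file; the close() wrapper syncs data written
     * through a descriptor that was opened for writing */
    int fd = open(path, O_CREAT | O_WRONLY | O_TRUNC, 0644);
    if (fd < 0) {
        perror("open");
        return -1;
    }
    const char data[] = "final results\n";
    if (write(fd, data, sizeof(data) - 1) < 0) {
        perror("write");
        close(fd);
        return -1;
    }
    close(fd);

    /* clearing all write bits (0222) triggers the laminate RPC in chmod() */
    if (chmod(path, 0444) != 0) {
        perror("chmod");
        return -1;
    }

    /* the stat wrappers can then be used to query the global metadata */
    struct stat sb;
    if (stat(path, &sb) == 0) {
        printf("%s laminated, size=%lld\n", path, (long long) sb.st_size);
    }
    return 0;
}
/*
 * Keeping lamination behind chmod() means applications need no
 * UnifyFS-specific call to finalize a file, at the cost of overloading
 * standard permission semantics.
 */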
@@ -55,15 +55,20 @@ UNIFYFS_DECL(access, int, (const char* pathname, int mode)); UNIFYFS_DECL(mkdir, int, (const char* path, mode_t mode)); UNIFYFS_DECL(rmdir, int, (const char* path)); +UNIFYFS_DECL(chdir, int, (const char* path)); +UNIFYFS_DECL(__getcwd_chk, char*, (char* path, size_t, size_t)); +UNIFYFS_DECL(getcwd, char*, (char* path, size_t)); +UNIFYFS_DECL(getwd, char*, (char* path)); +UNIFYFS_DECL(get_current_dir_name, char*, (void)); UNIFYFS_DECL(unlink, int, (const char* path)); UNIFYFS_DECL(remove, int, (const char* path)); UNIFYFS_DECL(rename, int, (const char* oldpath, const char* newpath)); UNIFYFS_DECL(truncate, int, (const char* path, off_t length)); UNIFYFS_DECL(stat, int, (const char* path, struct stat* buf)); -//UNIFYFS_DECL(fstat, int, (int fd, struct stat* buf)); UNIFYFS_DECL(__xstat, int, (int vers, const char* path, struct stat* buf)); UNIFYFS_DECL(__lxstat, int, (int vers, const char* path, struct stat* buf)); -UNIFYFS_DECL(__fxstat, int, (int vers, int fd, struct stat* buf)); +UNIFYFS_DECL(statfs, int, (const char* path, struct statfs* fsbuf)); + /* --------------------------------------- * POSIX wrappers: file descriptors @@ -88,7 +93,11 @@ UNIFYFS_DECL(pwrite64, ssize_t, (int fd, const void* buf, size_t count, UNIFYFS_DECL(posix_fadvise, int, (int fd, off_t offset, off_t len, int advice)); UNIFYFS_DECL(lseek, off_t, (int fd, off_t offset, int whence)); UNIFYFS_DECL(lseek64, off64_t, (int fd, off64_t offset, int whence)); +UNIFYFS_DECL(fchdir, int, (int fd)); UNIFYFS_DECL(ftruncate, int, (int fd, off_t length)); +UNIFYFS_DECL(fstat, int, (int fd, struct stat* buf)); +UNIFYFS_DECL(__fxstat, int, (int vers, int fd, struct stat* buf)); +UNIFYFS_DECL(fstatfs, int, (int fd, struct statfs* fsbuf)); UNIFYFS_DECL(fsync, int, (int fd)); UNIFYFS_DECL(fdatasync, int, (int fd)); UNIFYFS_DECL(flock, int, (int fd, int operation)); @@ -103,18 +112,31 @@ UNIFYFS_DECL(close, int, (int fd)); UNIFYFS_DECL(lio_listio, int, (int mode, struct aiocb* const aiocb_list[], int nitems, struct sigevent* sevp)); -/* read count bytes info buf from file starting at offset pos, - * returns number of bytes actually read in retcount, - * retcount will be less than count only if an error occurs - * or end of file is reached */ -int unifyfs_fd_read(int fd, off_t pos, void* buf, size_t count, - size_t* retcount); +/* + * Read 'count' bytes info 'buf' from file starting at offset 'pos'. + * Returns UNIFYFS_SUCCESS and sets number of bytes actually read in bytes + * on success. Otherwise returns error code on error. + */ +int unifyfs_fd_read( + int fd, /* file descriptor to read from */ + off_t pos, /* offset within file to read from */ + void* buf, /* buffer to hold data */ + size_t count, /* number of bytes to read */ + size_t* nread /* number of bytes read */ +); -/* write count bytes from buf into file starting at offset pos, - * allocates new bytes and updates file size as necessary, - * fills any gaps with zeros */ -int unifyfs_fd_write(int fd, off_t pos, const void* buf, size_t count); -int unifyfs_fd_logreadlist(read_req_t* read_req, int count); +/* + * Write 'count' bytes from 'buf' into file starting at offset 'pos'. + * Returns UNIFYFS_SUCCESS and sets number of bytes actually written in bytes + * on success. Otherwise returns error code on error. 
+ */ +int unifyfs_fd_write( + int fd, /* file descriptor to write to */ + off_t pos, /* offset within file to write to */ + const void* buf, /* buffer holding data to write */ + size_t count, /* number of bytes to write */ + size_t* nwritten /* number of bytes written */ +); #include "unifyfs-dirops.h" diff --git a/client/src/unifyfs.c b/client/src/unifyfs.c index 1626916d7..3c876bde5 100644 --- a/client/src/unifyfs.c +++ b/client/src/unifyfs.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2019, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2019, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -42,26 +42,18 @@ #include "unifyfs-internal.h" #include "unifyfs-fixed.h" -#include "unifyfs_runstate.h" #include -#include -#include - -#ifdef HAVE_LIBNUMA -#include -#endif - -#ifdef UNIFYFS_GOTCHA -#include "gotcha/gotcha_types.h" -#include "gotcha/gotcha.h" -#include "gotcha_map_unifyfs_list.h" -#endif #include "unifyfs_client_rpcs.h" #include "unifyfs_server_rpcs.h" #include "unifyfs_rpc_util.h" #include "margo_client.h" +#include "seg_tree.h" + +#ifdef HAVE_SPATH +#include "spath.h" +#endif /* HAVE_SPATH */ /* avoid duplicate mounts (for now) */ static int unifyfs_mounted = -1; @@ -71,50 +63,27 @@ static int unifyfs_fpos_enabled = 1; /* whether we can use fgetpos/fsetpos */ unifyfs_cfg_t client_cfg; unifyfs_index_buf_t unifyfs_indices; -unifyfs_fattr_buf_t unifyfs_fattrs; static size_t unifyfs_index_buf_size; /* size of metadata log */ -static size_t unifyfs_fattr_buf_size; unsigned long unifyfs_max_index_entries; /* max metadata log entries */ -unsigned int unifyfs_max_fattr_entries; -int unifyfs_spillmetablock; int global_rank_cnt; /* count of world ranks */ -int local_rank_cnt; -int local_rank_idx; - -int local_del_cnt = 1; -int client_sockfd = -1; -struct pollfd cmd_fd; - -/* shared memory buffer to transfer read requests - * from client to server */ -static char shm_req_name[GEN_STR_LEN] = {0}; -static size_t shm_req_size = UNIFYFS_SHMEM_REQ_SIZE; -void* shm_req_buf; +int client_rank; /* client-provided rank (for debugging) */ /* shared memory buffer to transfer read replies * from server to client */ -static char shm_recv_name[GEN_STR_LEN] = {0}; -static size_t shm_recv_size = UNIFYFS_SHMEM_RECV_SIZE; -void* shm_recv_buf; +shm_context* shm_recv_ctx; // = NULL -char cmd_buf[CMD_BUF_SIZE] = {0}; - -int client_rank; -int app_id; -size_t unifyfs_key_slice_range; - -/* whether chunks should be allocated to - * store file contents in memory */ -int unifyfs_use_memfs = 1; - -/* whether chunks should be allocated to - * store file contents on spill over device */ -int unifyfs_use_spillover = 1; +int unifyfs_app_id; +int unifyfs_client_id; static int unifyfs_use_single_shm = 0; static int unifyfs_page_size = 0; +/* Determine whether we automatically sync every write to server. + * This slows write performance, but it can serve as a work + * around for apps that do not have all necessary syncs. 
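/* Hypothetical stand-alone stub (not the UnifyFS implementation) that
 * illustrates the calling convention of the unifyfs_fd_read()/
 * unifyfs_fd_write() prototypes above: the return value is an error code
 * (UNIFYFS_SUCCESS on success) and the transfer size comes back through an
 * out parameter, so a short read shows up in *nread rather than in the
 * return value as with read(2). DEMO_/demo_ names are stand-ins. */
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

#define DEMO_SUCCESS 0   /* stand-in for UNIFYFS_SUCCESS */

static int demo_fd_read(const char* backing, off_t pos, void* buf,
                        size_t count, size_t* nread)
{
    size_t len = strlen(backing);
    *nread = 0;
    if ((size_t)pos >= len) {
        return DEMO_SUCCESS;              /* at EOF: success, zero bytes */
    }
    size_t avail = len - (size_t)pos;
    size_t n = (count < avail) ? count : avail;
    memcpy(buf, backing + pos, n);
    *nread = n;                           /* a short read is visible here */
    return DEMO_SUCCESS;
}

int main(void)
{
    char buf[8] = {0};
    size_t nread = 0;
    int rc = demo_fd_read("unifyfs", 4, buf, sizeof(buf) - 1, &nread);
    printf("rc=%d nread=%zu buf=%s\n", rc, nread, buf); /* rc=0 nread=3 buf=yfs */
    return 0;
}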
*/ +static bool unifyfs_write_sync; + static off_t unifyfs_max_offt; static off_t unifyfs_min_offt; static off_t unifyfs_max_long; @@ -122,45 +91,24 @@ static off_t unifyfs_min_long; /* TODO: moved these to fixed file */ int unifyfs_max_files; /* maximum number of files to store */ -size_t unifyfs_chunk_mem; /* number of bytes in memory to be used for chunk storage */ -int unifyfs_chunk_bits; /* we set chunk size = 2^unifyfs_chunk_bits */ -off_t unifyfs_chunk_size; /* chunk size in bytes */ -off_t unifyfs_chunk_mask; /* mask applied to logical offset to determine physical offset within chunk */ -long unifyfs_max_chunks; /* maximum number of chunks that fit in memory */ - -/* number of bytes in spillover to be used for chunk storage */ -static size_t unifyfs_spillover_size; - -/* maximum number of chunks that fit in spillover storage */ -long unifyfs_spillover_max_chunks; - -#ifdef HAVE_LIBNUMA -static char unifyfs_numa_policy[10]; -static int unifyfs_numa_bank = -1; -#endif +bool unifyfs_local_extents; /* track data extents in client to read local */ -extern pthread_mutex_t unifyfs_stack_mutex; +/* log-based I/O context */ +logio_context* logio_ctx; /* keep track of what we've initialized */ int unifyfs_initialized = 0; -/* shared memory for superblock */ -static char shm_super_name[GEN_STR_LEN] = {0}; -static size_t shm_super_size; +/* superblock - persistent shared memory region (metadata + data) */ +static shm_context* shm_super_ctx; -/* global persistent memory block (metadata + data) */ -void* shm_super_buf; +/* per-file metadata */ static void* free_fid_stack; -void* free_chunk_stack; -void* free_spillchunk_stack; unifyfs_filename_t* unifyfs_filelist; static unifyfs_filemeta_t* unifyfs_filemetas; -unifyfs_chunkmeta_t* unifyfs_chunkmetas; - -char* unifyfs_chunks; -int unifyfs_spilloverblock = 0; -int unifyfs_spillmetablock = 0; /*used for log-structured i/o*/ +/* TODO: metadata spillover is not currently supported */ +int unifyfs_spillmetablock = -1; /* array of file descriptors */ unifyfs_fd_t unifyfs_fds[UNIFYFS_MAX_FILEDESCS]; @@ -195,16 +143,13 @@ void* unifyfs_dirstream_stack; /* mount point information */ char* unifyfs_mount_prefix; size_t unifyfs_mount_prefixlen = 0; -static key_t unifyfs_mount_shmget_key = 0; + +/* to track current working directory */ +char* unifyfs_cwd; /* mutex to lock stack operations */ pthread_mutex_t unifyfs_stack_mutex = PTHREAD_MUTEX_INITIALIZER; -/* path of external storage's mount point*/ - -char external_data_dir[UNIFYFS_MAX_FILENAME] = {0}; -char external_meta_dir[UNIFYFS_MAX_FILENAME] = {0}; - /* single function to route all unsupported wrapper calls through */ int unifyfs_vunsupported( const char* fn_name, @@ -243,82 +188,6 @@ int unifyfs_unsupported( return rc; } -/* given an UNIFYFS error code, return corresponding errno code */ -int unifyfs_err_map_to_errno(int rc) -{ - unifyfs_error_e ec = (unifyfs_error_e)rc; - - switch (ec) { - case UNIFYFS_SUCCESS: - return 0; - case UNIFYFS_ERROR_BADF: - return EBADF; - case UNIFYFS_ERROR_EXIST: - return EEXIST; - case UNIFYFS_ERROR_FBIG: - return EFBIG; - case UNIFYFS_ERROR_INVAL: - return EINVAL; - case UNIFYFS_ERROR_ISDIR: - return EISDIR; - case UNIFYFS_ERROR_NAMETOOLONG: - return ENAMETOOLONG; - case UNIFYFS_ERROR_NFILE: - return ENFILE; - case UNIFYFS_ERROR_NOENT: - return ENOENT; - case UNIFYFS_ERROR_NOMEM: - return ENOMEM; - case UNIFYFS_ERROR_NOSPC: - return ENOSPC; - case UNIFYFS_ERROR_NOTDIR: - return ENOTDIR; - case UNIFYFS_ERROR_OVERFLOW: - return EOVERFLOW; - default: - break; - } - return 
ec; -} - -/* given an errno error code, return corresponding UnifyFS error code */ -int unifyfs_errno_map_to_err(int rc) -{ - switch (rc) { - case 0: - return (int)UNIFYFS_SUCCESS; - case EBADF: - return (int)UNIFYFS_ERROR_BADF; - case EEXIST: - return (int)UNIFYFS_ERROR_EXIST; - case EFBIG: - return (int)UNIFYFS_ERROR_FBIG; - case EINVAL: - return (int)UNIFYFS_ERROR_INVAL; - case EIO: - return (int)UNIFYFS_ERROR_IO; - case EISDIR: - return (int)UNIFYFS_ERROR_ISDIR; - case ENAMETOOLONG: - return (int)UNIFYFS_ERROR_NAMETOOLONG; - case ENFILE: - return (int)UNIFYFS_ERROR_NFILE; - case ENOENT: - return (int)UNIFYFS_ERROR_NOENT; - case ENOMEM: - return (int)UNIFYFS_ERROR_NOMEM; - case ENOSPC: - return (int)UNIFYFS_ERROR_NOSPC; - case ENOTDIR: - return (int)UNIFYFS_ERROR_NOTDIR; - case EOVERFLOW: - return (int)UNIFYFS_ERROR_OVERFLOW; - default: - break; - } - return (int)UNIFYFS_FAILURE; -} - /* returns 1 if two input parameters will overflow their type when * added together */ inline int unifyfs_would_overflow_offt(off_t a, off_t b) @@ -383,26 +252,8 @@ inline int unifyfs_would_overflow_long(long a, long b) return 0; } -/* given an input mode, mask it with umask and return, can specify - * an input mode==0 to specify all read/write bits */ -mode_t unifyfs_getmode(mode_t perms) -{ - /* perms == 0 is shorthand for all read and write bits */ - if (perms == 0) { - perms = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH; - } - - /* get current user mask */ - mode_t mask = umask(0); - umask(mask); - - /* mask off bits from desired permissions */ - mode_t ret = perms & ~mask & 0777; - return ret; -} - /* lock access to shared data structures in superblock */ -inline int unifyfs_stack_lock() +inline int unifyfs_stack_lock(void) { if (unifyfs_use_single_shm) { return pthread_mutex_lock(&unifyfs_stack_mutex); @@ -411,7 +262,7 @@ inline int unifyfs_stack_lock() } /* unlock access to shared data structures in superblock */ -inline int unifyfs_stack_unlock() +inline int unifyfs_stack_unlock(void) { if (unifyfs_use_single_shm) { return pthread_mutex_unlock(&unifyfs_stack_mutex); @@ -419,19 +270,57 @@ inline int unifyfs_stack_unlock() return 0; } +static void unifyfs_normalize_path(const char* path, char* normalized) +{ + /* if we have a relative path, prepend the current working directory */ + if (path[0] != '/' && unifyfs_cwd != NULL) { + /* got a relative path, add our cwd */ + snprintf(normalized, UNIFYFS_MAX_FILENAME, "%s/%s", unifyfs_cwd, path); + } else { + snprintf(normalized, UNIFYFS_MAX_FILENAME, "%s", path); + } + +#ifdef HAVE_SPATH + /* normalize path to handle '.', '..', + * and extra or trailing '/' characters */ + char* str = spath_strdup_reduce_str(normalized); + snprintf(normalized, UNIFYFS_MAX_FILENAME, "%s", str); + free(str); +#endif /* HAVE_SPATH */ +} + /* sets flag if the path is a special path */ -inline int unifyfs_intercept_path(const char* path) +inline int unifyfs_intercept_path(const char* path, char* upath) { /* don't intecept anything until we're initialized */ if (!unifyfs_initialized) { return 0; } + /* if we have a relative path, prepend the current working directory */ + char target[UNIFYFS_MAX_FILENAME]; + unifyfs_normalize_path(path, target); + /* if the path starts with our mount point, intercept it */ - if (strncmp(path, unifyfs_mount_prefix, unifyfs_mount_prefixlen) == 0) { - return 1; + int intercept = 0; + if (strncmp(target, unifyfs_mount_prefix, unifyfs_mount_prefixlen) == 0) { + /* characters in target up through mount point match, + * assume we match */ 
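/* Minimal sketch of the relative-path handling in unifyfs_normalize_path()
 * above: a path that does not begin with '/' is prefixed with the tracked
 * working directory (unifyfs_cwd) before the mount-point comparison; the
 * '.'/'..' reduction provided by spath is omitted here. Names with a
 * demo_/DEMO_ prefix are stand-ins, not UnifyFS symbols. */
#include <stdio.h>

#define DEMO_MAX_FILENAME 128   /* stand-in for UNIFYFS_MAX_FILENAME */

static void demo_normalize(const char* path, const char* cwd, char* out)
{
    if (path[0] != '/' && cwd != NULL) {
        /* relative path: resolve it against the client's working directory */
        snprintf(out, DEMO_MAX_FILENAME, "%s/%s", cwd, path);
    } else {
        snprintf(out, DEMO_MAX_FILENAME, "%s", path);
    }
}

int main(void)
{
    char out[DEMO_MAX_FILENAME];
    demo_normalize("data/file.bin", "/unifyfs/run1", out);
    printf("%s\n", out);   /* /unifyfs/run1/data/file.bin */
    return 0;
}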
+ intercept = 1; + + /* if we have another character, it must be '/' */ + if (strlen(target) > unifyfs_mount_prefixlen && + target[unifyfs_mount_prefixlen] != '/') { + intercept = 0; + } } - return 0; + + /* copy normalized path into upath */ + if (intercept) { + strncpy(upath, target, UNIFYFS_MAX_FILENAME); + } + + return intercept; } /* given an fd, return 1 if we should intercept this file, 0 otherwise, @@ -611,314 +500,1068 @@ unifyfs_filemeta_t* unifyfs_get_meta_from_fid(int fid) return NULL; } +int unifyfs_fid_is_laminated(int fid) +{ + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); + if ((meta != NULL) && (meta->fid == fid)) { + return meta->is_laminated; + } + return 0; +} + +int unifyfs_fd_is_laminated(int fd) +{ + int fid = unifyfs_get_fid_from_fd(fd); + return unifyfs_fid_is_laminated(fid); +} + /* --------------------------------------- * Operations on file storage * --------------------------------------- */ /* allocate and initialize data management resource for file */ -static int unifyfs_fid_store_alloc(int fid) +static int fid_store_alloc(int fid) { /* get meta data for this file */ unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); + if ((meta != NULL) && (meta->fid == fid)) { + /* indicate that we're using LOGIO to store data for this file */ + meta->storage = FILE_STORAGE_LOGIO; - /* indicate that we're using LOGIO to store data for this file */ - meta->storage = FILE_STORAGE_LOGIO; + /* Initialize our segment tree that will record our writes */ + int rc = seg_tree_init(&meta->extents_sync); + if (rc != 0) { + return rc; + } - return UNIFYFS_SUCCESS; + /* Initialize our segment tree to track extents for all writes + * by this process, can be used to read back local data */ + if (unifyfs_local_extents) { + rc = seg_tree_init(&meta->extents); + if (rc != 0) { + return rc; + } + } + + return UNIFYFS_SUCCESS; + } else { + LOGERR("failed to get filemeta for fid=%d", fid); + } + return UNIFYFS_FAILURE; } /* free data management resource for file */ -static int unifyfs_fid_store_free(int fid) +static int fid_store_free(int fid) { /* get meta data for this file */ unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); + if ((meta != NULL) && (meta->fid == fid)) { + /* set storage type back to NULL */ + meta->storage = FILE_STORAGE_NULL; - /* set storage type back to NULL */ - meta->storage = FILE_STORAGE_NULL; + /* Free our write seg_tree */ + seg_tree_destroy(&meta->extents_sync); - return UNIFYFS_SUCCESS; + /* Free our extent seg_tree */ + if (unifyfs_local_extents) { + seg_tree_destroy(&meta->extents); + } + + return UNIFYFS_SUCCESS; + } + return UNIFYFS_FAILURE; } -/* --------------------------------------- - * Operations on file ids - * --------------------------------------- */ +/* ======================================= + * Operations on global file ids + * ======================================= */ -/* checks to see if fid is a directory - * returns 1 for yes - * returns 0 for no */ -int unifyfs_fid_is_dir(int fid) +/* order by file id then by offset */ +static int compare_read_req(const void* a, const void* b) { - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - if (meta && meta->mode & S_IFDIR) { - return 1; - } else { - /* if it doesn't exist, then it's not a directory? 
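/* Stand-alone sketch (not the UnifyFS code) of the prefix test completed
 * just above in unifyfs_intercept_path(): a path is intercepted only if it
 * equals the mount point or continues with a '/' right after it, so
 * "/unifyfs2/file" does not match a mount point of "/unifyfs". */
#include <stdio.h>
#include <string.h>

static int under_mount(const char* path, const char* mnt)
{
    size_t len = strlen(mnt);
    if (strncmp(path, mnt, len) != 0) {
        return 0;                       /* prefix does not match */
    }
    return path[len] == '\0' || path[len] == '/';
}

int main(void)
{
    printf("%d\n", under_mount("/unifyfs/a.txt", "/unifyfs"));  /* 1 */
    printf("%d\n", under_mount("/unifyfs", "/unifyfs"));        /* 1 */
    printf("%d\n", under_mount("/unifyfs2/a.txt", "/unifyfs")); /* 0 */
    return 0;
}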
*/ + const read_req_t* ptr_a = a; + const read_req_t* ptr_b = b; + + if (ptr_a->gfid != ptr_b->gfid) { + if (ptr_a->gfid < ptr_b->gfid) { + return -1; + } else { + return 1; + } + } + + if (ptr_a->offset == ptr_b->offset) { return 0; + } else if (ptr_a->offset < ptr_b->offset) { + return -1; + } else { + return 1; } } -/* - * hash a path to gfid - * @param path: file path - * return: gfid - */ -int unifyfs_generate_gfid(const char* path) +/* notify our reqmgr that the shared memory buffer + * is now clear and ready to hold more read data */ +static void delegator_signal(void) { - unsigned char digested[16] = { 0, }; - unsigned long len = strlen(path); - int* ival = (int*) digested; + LOGDBG("receive buffer now empty"); - MD5((const unsigned char*) path, len, digested); + /* set shm flag to signal reqmgr we're done */ + shm_data_header* hdr = (shm_data_header*)(shm_recv_ctx->addr); + hdr->state = SHMEM_REGION_EMPTY; - return abs(ival[0]); + /* TODO: MEM_FLUSH */ } -static int unifyfs_gfid_from_fid(const int fid) +/* wait for reqmgr to inform us that shared memory buffer + * is filled with read data */ +static int delegator_wait(void) { - /* check that local file id is in range */ - if (fid < 0 || fid >= unifyfs_max_files) { - return -EINVAL; - } + int rc = (int)UNIFYFS_SUCCESS; - /* lookup file name structure for this local file id */ - unifyfs_filename_t* fname = &unifyfs_filelist[fid]; + /* specify time to sleep between checking flag in shared + * memory indicating server has produced */ + struct timespec shm_wait_tm; + shm_wait_tm.tv_sec = 0; + shm_wait_tm.tv_nsec = SHM_WAIT_INTERVAL; - /* generate global file id from path if file is valid */ - if (fname->in_use) { - int gfid = unifyfs_generate_gfid(fname->filename); - return gfid; + /* get pointer to flag in shared memory */ + shm_data_header* hdr = (shm_data_header*)(shm_recv_ctx->addr); + + /* wait for server to set flag to non-zero */ + int max_sleep = 5000000; // 5s + volatile int* vip = (volatile int*)&(hdr->state); + while (*vip == SHMEM_REGION_EMPTY) { + /* not there yet, sleep for a while */ + nanosleep(&shm_wait_tm, NULL); + /* TODO: MEM_FETCH */ + max_sleep--; + if (0 == max_sleep) { + LOGERR("timed out waiting for non-empty"); + rc = (int)UNIFYFS_ERROR_SHMEM; + break; + } } - return -EINVAL; + return rc; } -/* Given a fid, return the path. */ -const char* unifyfs_path_from_fid(int fid) +/* copy read data from shared memory buffer to user buffers from read + * calls, sets done=1 on return when reqmgr informs us it has no + * more data */ +static int process_read_data(read_req_t* read_reqs, int count, int* done) { - unifyfs_filename_t* fname = &unifyfs_filelist[fid]; - if (fname->in_use) { - return fname->filename; - } - return NULL; -} + /* assume we'll succeed */ + int rc = UNIFYFS_SUCCESS; -/* checks to see if a directory is empty - * assumes that check for is_dir has already been made - * only checks for full path matches, does not check relative paths, - * e.g. 
../dirname will not work - * returns 1 for yes it is empty - * returns 0 for no */ -int unifyfs_fid_is_dir_empty(const char* path) -{ - int i = 0; - while (i < unifyfs_max_files) { - /* only check this element if it's active */ - if (unifyfs_filelist[i].in_use) { - /* if the file starts with the path, it is inside of that directory - * also check to make sure that it's not the directory entry itself */ - char* strptr = strstr(path, unifyfs_filelist[i].filename); - if (strptr == unifyfs_filelist[i].filename && - strcmp(path, unifyfs_filelist[i].filename) != 0) { - /* found a child item in path */ - LOGDBG("File found: unifyfs_filelist[%d].filename = %s", - i, (char*)&unifyfs_filelist[i].filename); - return 0; + /* get pointer to start of shared memory buffer */ + shm_data_header* shm_hdr = (shm_data_header*)(shm_recv_ctx->addr); + char* shmptr = ((char*)shm_hdr) + sizeof(shm_data_header); + + /* get number of read replies in shared memory */ + size_t num = shm_hdr->meta_cnt; + + /* process each of our read replies */ + size_t i; + for (i = 0; i < num; i++) { + /* get pointer to current read reply header */ + shm_data_meta* rep = (shm_data_meta*)shmptr; + shmptr += sizeof(shm_data_meta); + + /* get pointer to data */ + char* rep_buf = shmptr; + shmptr += rep->length; + + LOGDBG("processing data response from server: " + "[%zu] (gfid=%d, offset=%lu, length=%lu, errcode=%d)", + i, rep->gfid, rep->offset, rep->length, rep->errcode); + + /* get start and end offset of reply */ + size_t rep_start = rep->offset; + size_t rep_end = rep->offset + rep->length; + + /* iterate over each of our read requests */ + size_t j; + for (j = 0; j < count; j++) { + /* get pointer to read request */ + read_req_t* req = &read_reqs[j]; + + /* skip if this request if not the same file */ + if (rep->gfid != req->gfid) { + /* request and reply are for different files */ + continue; + } + + /* same file, now get start and end offsets + * of this read request */ + size_t req_start = req->offset; + size_t req_end = req->offset + req->length; + + /* test whether reply overlaps with request, + * overlap if: + * start of reply comes before the end of request + * AND + * end of reply comes after the start of request */ + int overlap = (rep_start < req_end && rep_end > req_start); + if (!overlap) { + /* reply does not overlap with this request */ + continue; + } + + /* this reply overlaps with the request, check that + * we didn't get an error */ + if (rep->errcode != UNIFYFS_SUCCESS) { + /* TODO: should we look for the reply with an errcode + * with the lowest start offset? 
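/* Stand-alone sketch of the polling handshake used by delegator_wait()
 * above: the client spins on a state flag in shared memory, sleeping a
 * fixed interval between checks and giving up after a bounded number of
 * attempts. The flag values, interval, and limit here are illustrative
 * only, not the UnifyFS constants. */
#include <stdio.h>
#include <time.h>

#define DEMO_EMPTY 0
#define DEMO_READY 1

static int wait_for_ready(volatile int* state, long interval_ns, int max_tries)
{
    struct timespec ts = { 0, interval_ns };
    while (*state == DEMO_EMPTY) {
        nanosleep(&ts, NULL);          /* give the server time to produce */
        if (--max_tries == 0) {
            return -1;                 /* timed out waiting for data */
        }
    }
    return 0;
}

int main(void)
{
    volatile int state = DEMO_READY;   /* pretend the server already produced */
    printf("%d\n", wait_for_ready(&state, 1000, 100));  /* 0 */
    return 0;
}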
*/ + + /* read reply has an error, mark the read request + * as also having an error, then quit processing */ + req->errcode = rep->errcode; + continue; + } + + /* otherwise, we have an error-free, overlapping reply + * for this request, copy data into request buffer */ + + /* start of overlapping segment is the maximum of + * reply and request start offsets */ + size_t start = rep_start; + if (req_start > start) { + start = req_start; + } + + /* end of overlapping segment is the mimimum of + * reply and request end offsets */ + size_t end = rep_end; + if (req_end < end) { + end = req_end; + } + + /* compute length of overlapping segment */ + size_t length = end - start; + + /* get number of bytes from start of reply and request + * buffers to the start of the overlap region */ + size_t rep_offset = start - rep_start; + size_t req_offset = start - req_start; + + /* if we have a gap, fill with zeros */ + size_t gap_start = req_start + req->nread; + if (start > gap_start) { + size_t gap_length = start - gap_start; + char* req_ptr = req->buf + req->nread; + memset(req_ptr, 0, gap_length); + } + + /* copy data from reply buffer into request buffer */ + char* req_ptr = req->buf + req_offset; + char* rep_ptr = rep_buf + rep_offset; + memcpy(req_ptr, rep_ptr, length); + + LOGDBG("copied data to application buffer (%lu bytes)", length); + + /* update max number of bytes we have written to in the + * request buffer */ + size_t nread = end - req_start; + if (nread > req->nread) { + req->nread = nread; } } + } - /* go on to next file */ - i++; + /* set done flag if there is no more data */ + if (shm_hdr->state == SHMEM_REGION_DATA_COMPLETE) { + *done = 1; } - /* couldn't find any files with this prefix, dir must be empty */ - return 1; + return rc; } -/* return current size of given file id */ -off_t unifyfs_fid_size(int fid) -{ - /* get meta data for this file */ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - return meta->size; -} +/* This uses information in the extent map for a file on the client to + * complete any read requests. It only complets a request if it contains + * all of the data. Otherwise the request is copied to the list of + * requests to be handled by the server. 
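/* Self-contained sketch of the interval arithmetic used when matching a
 * read reply against a read request in process_read_data() above:
 * half-open ranges overlap when the reply starts before the request ends
 * and ends after the request starts, and the copied segment is the
 * intersection of the two ranges (any gap before it is zero-filled by the
 * real code). Plain C, no UnifyFS types. */
#include <stdio.h>
#include <stddef.h>

typedef struct { size_t start; size_t end; } range;   /* [start, end) */

static int intersect(range rep, range req, range* out)
{
    if (rep.start >= req.end || rep.end <= req.start) {
        return 0;                         /* no overlap */
    }
    out->start = (rep.start > req.start) ? rep.start : req.start;
    out->end   = (rep.end   < req.end)   ? rep.end   : req.end;
    return 1;
}

int main(void)
{
    range rep = { 100, 200 };   /* bytes the server sent back */
    range req = { 150, 400 };   /* bytes the application asked for */
    range ov;
    if (intersect(rep, req, &ov)) {
        printf("copy %zu bytes at request offset %zu\n",
               ov.end - ov.start, ov.start - req.start); /* 50 bytes at 0 */
    }
    return 0;
}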
*/ +static void service_local_reqs( + read_req_t* read_reqs, /* list of input read requests */ + int count, /* number of input read requests */ + read_req_t* local_reqs, /* list to copy requests completed by client */ + read_req_t* server_reqs, /* list to copy requests to be handled by server */ + int* out_count) /* number of items copied to server list */ +{ + /* this will track the total number of requests we're passing + * on to the server */ + int local_count = 0; + int server_count = 0; + + /* iterate over each input read request, satisfy it locally if we can + * otherwise copy request into output list that the server will handle + * for us */ + int i; + for (i = 0; i < count; i++) { + /* get current read request */ + read_req_t* req = &read_reqs[i]; -/* - * insert file attribute to attributed shared memory buffer, - * keep entries ordered by file id - */ -static int ins_file_meta(unifyfs_fattr_buf_t* ptr_f_meta_log, - unifyfs_file_attr_t* ins_fattr) -{ - /* get pointer to start of stat structures in shared memory buffer */ - unifyfs_file_attr_t* meta_entry = ptr_f_meta_log->meta_entry; + /* skip any request that's already completed or errored out, + * we pass those requests on to server */ + if (req->nread >= req->length || req->errcode != UNIFYFS_SUCCESS) { + /* copy current request into list of requests + * that we'll ask server for */ + memcpy(&server_reqs[server_count], req, sizeof(read_req_t)); + server_count++; + continue; + } - /* get number of active entries currently in the buffer */ - int meta_cnt = *(ptr_f_meta_log->ptr_num_entries); + /* get gfid, start, and length of this request */ + int gfid = req->gfid; + size_t req_start = req->offset; + size_t req_end = req->offset + req->length; - /* TODO: Improve the search time */ - /* search backwards until we find an entry whose - * file id is less than the current file id */ - int i; - for (i = meta_cnt - 1; i >= 0; i--) { - if (meta_entry[i].fid <= ins_fattr->fid) { - /* sort in acsending order */ - break; + /* lookup local extents if we have them */ + int fid = unifyfs_fid_from_gfid(gfid); + + /* move to next request if we can't find the matching fid */ + if (fid < 0) { + /* copy current request into list of requests + * that we'll ask server for */ + memcpy(&server_reqs[server_count], req, sizeof(read_req_t)); + server_count++; + continue; + } + + /* get pointer to extents for this file */ + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); + assert(meta != NULL); + struct seg_tree* extents = &meta->extents; + + /* lock the extent tree for reading */ + seg_tree_rdlock(extents); + + /* identify whether we can satisfy this full request + * or not, assume we can */ + int have_local = 1; + + /* this will point to the offset of the next byte we + * need to account for */ + size_t expected_start = req_start; + + /* iterate over extents we have for this file, + * and check that there are no holes in coverage, + * we search for a starting extent using a range + * of just the very first byte that we need */ + struct seg_tree_node* first; + first = seg_tree_find_nolock(extents, req_start, req_start); + struct seg_tree_node* next = first; + while (next != NULL && next->start < req_end) { + if (expected_start >= next->start) { + /* this extent has the next byte we expect, + * bump up to the first byte past the end + * of this extent */ + expected_start = next->end + 1; + } else { + /* there is a gap between extents so we're missing + * some bytes */ + have_local = 0; + break; + } + + /* get the next element in the tree */ + 
next = seg_tree_iter(extents, next); } - } - /* compute position to store stat info for this file */ - int ins_pos = i + 1; + /* check that we account for the full request + * up until the last byte */ + if (expected_start < req_end) { + /* missing some bytes at the end of the request */ + have_local = 0; + } - /* we need to move some entries up a slot to make room - * for this one */ - for (i = meta_cnt - 1; i >= ins_pos; i--) { - meta_entry[i + 1] = meta_entry[i]; - } + /* if we can't fully satisfy the request, copy request to + * output array, so it can be passed on to server */ + if (!have_local) { + /* copy current request into list of requests + * that we'll ask server for */ + memcpy(&server_reqs[server_count], req, sizeof(read_req_t)); + server_count++; - /* insert stat data for this file into buffer */ - meta_entry[ins_pos] = *ins_fattr; + /* release lock before we go to next request */ + seg_tree_unlock(extents); - /* increment our count of active entries */ - (*ptr_f_meta_log->ptr_num_entries)++; + continue; + } - return 0; -} + /* otherwise we can copy the data locally, iterate + * over the extents and copy data into request buffer, + * again search for a starting extent using a range + * of just the very first byte that we need */ + next = first; + while ((next != NULL) && (next->start < req_end)) { + /* get start and end of this extent (reply) */ + size_t rep_start = next->start; + size_t rep_end = next->end + 1; -unifyfs_filemeta_t* meta; -int unifyfs_set_global_file_meta(int fid, int gfid) -{ - int ret = 0; - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - unifyfs_file_attr_t new_fmeta = {0}; - struct timespec tp = {0}; + /* get the offset into the log */ + size_t rep_log_pos = next->ptr; - const char* path = unifyfs_path_from_fid(fid); + /* start of overlapping segment is the maximum of + * reply and request start offsets */ + size_t start = rep_start; + if (req_start > start) { + start = req_start; + } - sprintf(new_fmeta.filename, "%s", path); + /* end of overlapping segment is the mimimum of + * reply and request end offsets */ + size_t end = rep_end; + if (req_end < end) { + end = req_end; + } - new_fmeta.fid = fid; - new_fmeta.gfid = gfid; + /* compute length of overlapping segment */ + size_t length = end - start; - clock_gettime(CLOCK_REALTIME, &tp); - new_fmeta.atime = tp; - new_fmeta.mtime = tp; - new_fmeta.ctime = tp; + /* get number of bytes from start of reply and request + * buffers to the start of the overlap region */ + size_t rep_offset = start - rep_start; + size_t req_offset = start - req_start; - new_fmeta.mode = meta->mode; - new_fmeta.is_laminated = meta->is_laminated; + /* if we have a gap, fill with zeros */ + size_t gap_start = req_start + req->nread; + if (start > gap_start) { + size_t gap_length = start - gap_start; + char* req_ptr = req->buf + req->nread; + memset(req_ptr, 0, gap_length); + } - if (meta->is_laminated) { - /* - * If is_laminated is set, we're either laminating for the first time, - * in which case meta->size will have already been calculated and - * filled in with the global file size, or, the file was already - * laminated. In either case, we write meta->size to the server. 
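/* Simplified sketch (arrays instead of the seg_tree API) of the coverage
 * test in service_local_reqs() above: walk extents sorted by start offset
 * and check that each one begins at or before the next byte still needed;
 * any gap, or running out of extents early, means the request cannot be
 * satisfied from local data alone and must go to the server. */
#include <stdio.h>
#include <stddef.h>

typedef struct { size_t start; size_t end; } extent;  /* inclusive [start, end] */

static int covered_locally(const extent* exts, int n,
                           size_t req_start, size_t req_end /* exclusive */)
{
    size_t expected = req_start;
    for (int i = 0; i < n && exts[i].start < req_end; i++) {
        if (expected >= exts[i].start) {
            expected = exts[i].end + 1;   /* extent extends our coverage */
        } else {
            return 0;                     /* hole before this extent */
        }
    }
    return expected >= req_end;           /* covered through the last byte */
}

int main(void)
{
    extent exts[] = { {0, 99}, {100, 299} };
    printf("%d\n", covered_locally(exts, 2, 50, 250));  /* 1: fully local */
    printf("%d\n", covered_locally(exts, 2, 50, 400));  /* 0: short at the end */
    return 0;
}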
- */ - new_fmeta.size = meta->size; - } else { - new_fmeta.size = 0; - } + /* copy data from local write log into request buffer */ + char* req_ptr = req->buf + req_offset; + off_t log_offset = rep_log_pos + rep_offset; + size_t nread = 0; + int rc = unifyfs_logio_read(logio_ctx, log_offset, length, + req_ptr, &nread); + if (rc == UNIFYFS_SUCCESS) { + if (nread < length) { + /* account for short read by updating end offset */ + end -= (length - nread); + } + /* update max number of bytes we have filled in the req buf */ + size_t req_nread = end - req_start; + if (req_nread > req->nread) { + req->nread = req_nread; + } + } else { + LOGERR("local log read failed for offset=%zu size=%zu", + (size_t)log_offset, length); + req->errcode = rc; + } - new_fmeta.uid = getuid(); - new_fmeta.gid = getgid(); + /* get the next element in the tree */ + next = seg_tree_iter(extents, next); + } - ret = invoke_client_metaset_rpc(&new_fmeta); - if (ret < 0) { - return ret; + /* copy request data to list we completed locally */ + memcpy(&local_reqs[local_count], req, sizeof(read_req_t)); + local_count++; + + /* done reading the tree */ + seg_tree_unlock(extents); } - ins_file_meta(&unifyfs_fattrs, &new_fmeta); + /* return to user the number of key/values we set */ + *out_count = server_count; - return 0; + return; } -int unifyfs_get_global_file_meta(int fid, int gfid, unifyfs_file_attr_t* gfattr) +/* + * get data for a list of read requests from the + * reqmgr + * + * @param read_reqs: a list of read requests + * @param count: number of read requests + * @return error code + * */ +int unifyfs_gfid_read_reqs(read_req_t* in_reqs, int in_count) { - if (!gfattr) { - return -EINVAL; - } + int i; + int read_rc; - unifyfs_file_attr_t fmeta; - int ret = invoke_client_metaget_rpc(gfid, &fmeta); - if (ret == UNIFYFS_SUCCESS) { - *gfattr = fmeta; - gfattr->fid = fid; - } + /* assume we'll succeed */ + int rc = UNIFYFS_SUCCESS; - return ret; -} + /* assume we'll service all requests from the server */ + int count = in_count; + read_req_t* read_reqs = in_reqs; -/* fill in limited amount of stat information for global file id */ -int unifyfs_gfid_stat(int gfid, struct stat* buf) -{ - /* check that we have an output buffer to write to */ - if (!buf) { - return UNIFYFS_ERROR_INVAL; - } + /* TODO: if the file is laminated so that we know the file size, + * we can adjust read requests to not read past the EOF */ - /* zero out user's stat buffer */ - memset(buf, 0, sizeof(struct stat)); + /* if the option is enabled to service requests locally, try it, + * in this case we'll allocate a large array which we split into + * two, the first half will record requests we completed locally + * and the second half will store requests to be sent to the server */ - /* lookup stat data for global file id */ - unifyfs_file_attr_t fattr; - int ret = invoke_client_metaget_rpc(gfid, &fattr); - if (ret != UNIFYFS_SUCCESS) { - return ret; - } + /* this records the pointer to the temp request array if + * we allocate one, we should free this later if not NULL */ + read_req_t* reqs = NULL; - /* It was decided that non-laminated files return a file size of 0 */ - if (!fattr.is_laminated) { - fattr.size = 0; - } + /* this will point to the start of the array of requests we + * complete locally */ + read_req_t* local_reqs = NULL; - /* copy stat structure */ - unifyfs_file_attr_to_stat(&fattr, buf); + /* attempt to complete requests locally if enabled */ + if (unifyfs_local_extents) { + /* allocate space to make local and server copies of the 
requests, + * each list will be at most in_count long */ + size_t reqs_size = 2 * in_count * sizeof(read_req_t); + reqs = (read_req_t*) malloc(reqs_size); + if (reqs == NULL) { + return ENOMEM; + } - return UNIFYFS_SUCCESS; -} + /* define pointers to space where we can build our list + * of requests handled on the client and those left + * for the server */ + local_reqs = &reqs[0]; + read_reqs = &reqs[in_count]; -/* fill in limited amount of stat information */ -int unifyfs_fid_stat(int fid, struct stat* buf) -{ - /* check that fid is defined */ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - if (meta == NULL) { - return UNIFYFS_ERROR_IO; - } + /* service reads from local extent info if we can, this copies + * completed requests from in_reqs into local_reqs, and it copies + * any requests that can't be completed locally into the read_reqs + * to be processed by the server */ + service_local_reqs(in_reqs, in_count, local_reqs, read_reqs, &count); - /* get global file id corresponding to local file id */ - int gfid = unifyfs_gfid_from_fid(fid); + /* bail early if we satisfied all requests locally */ + if (count == 0) { + /* copy completed requests back into user's array */ + memcpy(in_reqs, local_reqs, in_count * sizeof(read_req_t)); - /* lookup stat info for global file id */ - int ret = unifyfs_gfid_stat(gfid, buf); - if (ret != UNIFYFS_SUCCESS) { - return UNIFYFS_ERROR_IO; + /* free the temporary array */ + free(reqs); + return rc; + } } - return UNIFYFS_SUCCESS; -} + /* TODO: When the number of read requests exceed the + * request buffer, split list io into multiple bulk + * sends and transfer in bulks */ -/* allocate a file id slot for a new file - * return the fid or -1 on error */ -int unifyfs_fid_alloc() -{ - unifyfs_stack_lock(); - int fid = unifyfs_stack_pop(free_fid_stack); - unifyfs_stack_unlock(); - LOGDBG("unifyfs_stack_pop() gave %d", fid); - if (fid < 0) { - /* need to create a new file, but we can't */ - LOGERR("unifyfs_stack_pop() failed (%d)", fid); - return -1; + /* check that we have enough slots for all read requests */ + if (count > UNIFYFS_MAX_READ_CNT) { + LOGERR("Too many requests to pass to server"); + if (reqs != NULL) { + free(reqs); + } + return ENOSPC; } - return fid; -} -/* return the file id back to the free pool */ -int unifyfs_fid_free(int fid) -{ - unifyfs_stack_lock(); - unifyfs_stack_push(free_fid_stack, fid); + /* order read request by increasing file id, then increasing offset */ + qsort(read_reqs, count, sizeof(read_req_t), compare_read_req); + + /* prepare our shared memory buffer for reqmgr */ + delegator_signal(); + + /* for mread, we need to manually track the rpc progress */ + unifyfs_mread_rpc_ctx_t mread_ctx = { 0, }; + + /* we select different rpcs depending on the number of + * read requests */ + if (count > 1) { + /* got multiple read requests */ + size_t size = (size_t)count * sizeof(unifyfs_extent_t); + void* buffer = malloc(size); + if (NULL == buffer) { + return ENOMEM; + } + unifyfs_extent_t* extents = (unifyfs_extent_t*)buffer; + unifyfs_extent_t* ext; + read_req_t* req; + for (i = 0; i < count; i++) { + ext = extents + i; + req = read_reqs + i; + ext->gfid = req->gfid; + ext->offset = req->offset; + ext->length = req->length; + } + + LOGDBG("mread: n_reqs:%d, reqs(%p) sz:%zu", + count, buffer, size); + + /* invoke multi-read rpc */ + read_rc = invoke_client_mread_rpc(count, size, buffer, &mread_ctx); + free(buffer); + } else { + /* got a single read request */ + int gfid = read_reqs[0].gfid; + size_t offset = 
read_reqs[0].offset; + size_t length = read_reqs[0].length; + + LOGDBG("read: offset:%zu, len:%zu", offset, length); + + /* invoke single read rpc */ + read_rc = invoke_client_read_rpc(gfid, offset, length); + } + + /* ENODATA means server has no extents matching request(s) */ + if (read_rc != ENODATA) { + /* bail out with error if we failed to even start the read */ + if (read_rc != UNIFYFS_SUCCESS) { + LOGERR("Failed to issue read RPC to server"); + if (reqs != NULL) { + free(reqs); + } + return read_rc; + } + + /* spin waiting for read data to come back from the server, + * we process it in batches as it comes in, eventually the + * server will tell us it's sent us everything it can */ + int done = 0; + int rpc_done = 0; + while (!done) { + int tmp_rc = delegator_wait(); + if (tmp_rc != UNIFYFS_SUCCESS) { + rc = UNIFYFS_FAILURE; + done = 1; + } else { + tmp_rc = process_read_data(read_reqs, count, &done); + if (tmp_rc != UNIFYFS_SUCCESS) { + LOGERR("failed to process data from server"); + rc = UNIFYFS_FAILURE; + } + delegator_signal(); + } + + /* if this was mread, track the progress */ + if (count > 1 && !rpc_done) { + tmp_rc = unifyfs_mread_rpc_status_check(&mread_ctx); + if (tmp_rc < 0) { + LOGERR("failed to check the rpc progress"); + continue; + } + + /* if we received a response from the server, check for errors. + * upon finding errors, do not wait anymore. */ + if (tmp_rc) { + LOGDBG("received rpc response from the server (ret=%d)", + mread_ctx.rpc_ret); + + if (mread_ctx.rpc_ret != UNIFYFS_SUCCESS) { + LOGERR("mread rpc failed on server (ret=%d)", + mread_ctx.rpc_ret); + return UNIFYFS_FAILURE; + } + + rpc_done = 1; + } + } + } + LOGDBG("fetched all data from server for %d requests", count); + } + + /* got all of the data we'll get from the server, + * check for short reads and whether those short + * reads are from errors, holes, or the end of the file */ + for (i = 0; i < count; i++) { + /* get pointer to next read request */ + read_req_t* req = &read_reqs[i]; + + /* no error message was received from server, set it success */ + if (req->errcode == EINPROGRESS) { + req->errcode = UNIFYFS_SUCCESS; + } + + /* if we hit an error on our read, nothing else to do */ + if (req->errcode != UNIFYFS_SUCCESS) { + continue; + } + + /* if we read all of the bytes, we're done */ + if (req->nread == req->length) { + continue; + } + + /* otherwise, we have a short read, check whether there + * would be a hole after us, in which case we fill the + * request buffer with zeros */ + + /* get file size for this file */ + off_t filesize_offt = unifyfs_gfid_filesize(req->gfid); + if (filesize_offt == (off_t)-1) { + /* failed to get file size */ + req->errcode = ENOENT; + continue; + } + size_t filesize = (size_t)filesize_offt; + + /* get offset of where hole starts */ + size_t gap_start = req->offset + req->nread; + + /* get last offset of the read request */ + size_t req_end = req->offset + req->length; + + /* if file size is larger than last offset we wrote to in + * read request, then there is a hole we can fill */ + if (filesize > gap_start) { + /* assume we can fill the full request with zero */ + size_t gap_length = req_end - gap_start; + if (req_end > filesize) { + /* request is trying to read past end of file, + * so only fill zeros up to end of file */ + gap_length = filesize - gap_start; + } + + /* copy zeros into request buffer */ + LOGDBG("zero-filling hole at offset %zu of length %zu", + gap_start, gap_length); + char* req_ptr = req->buf + req->nread; + memset(req_ptr, 0, gap_length); 
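/* Sketch of the end-of-file hole arithmetic in the short-read handling
 * above: zeros are filled from the last byte the server returned up to the
 * end of the request, clamped to the file size, so reads inside a hole see
 * zeros while reads past EOF stay short. Plain C, no UnifyFS types. */
#include <stdio.h>
#include <stddef.h>

static size_t hole_fill_length(size_t filesize, size_t req_offset,
                               size_t req_length, size_t nread)
{
    size_t gap_start = req_offset + nread;       /* first byte not yet filled */
    size_t req_end   = req_offset + req_length;  /* one past last requested byte */
    if (filesize <= gap_start) {
        return 0;                                /* no hole to fill */
    }
    size_t fill_end = (req_end > filesize) ? filesize : req_end;
    return fill_end - gap_start;
}

int main(void)
{
    /* 100-byte request at offset 0, server returned 40 bytes, file is 80
     * bytes long: fill 40 zero bytes (offsets 40..79), read stays short. */
    printf("%zu\n", hole_fill_length(80, 0, 100, 40));   /* 40 */
    return 0;
}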
+ + /* update number of bytes read */ + req->nread += gap_length; + } + } + + /* if we attempted to service requests from our local extent map, + * then we need to copy the resulting read requests from the local + * and server arrays back into the user's original array */ + if (unifyfs_local_extents) { + /* TODO: would be nice to copy these back into the same order + * in which we received them. */ + + /* copy locally completed requests back into user's array */ + int local_count = in_count - count; + if (local_count > 0) { + memcpy(in_reqs, local_reqs, local_count * sizeof(read_req_t)); + } + + /* copy sever completed requests back into user's array */ + if (count > 0) { + /* skip past any items we copied in from the local requests */ + read_req_t* in_ptr = in_reqs + local_count; + memcpy(in_ptr, read_reqs, count * sizeof(read_req_t)); + } + + /* free storage we used for copies of requests */ + if (reqs != NULL) { + free(reqs); + reqs = NULL; + } + } + + return rc; +} + +/* ======================================= + * Operations on file ids + * ======================================= */ + +/* checks to see if fid is a directory + * returns 1 for yes + * returns 0 for no */ +int unifyfs_fid_is_dir(int fid) +{ + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); + if ((meta != NULL) && (meta->mode & S_IFDIR)) { + return 1; + } else { + /* if it doesn't exist, then it's not a directory? */ + return 0; + } +} + +int unifyfs_gfid_from_fid(const int fid) +{ + /* check that local file id is in range */ + if (fid < 0 || fid >= unifyfs_max_files) { + return -1; + } + + /* return global file id, cached in file meta struct */ + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); + if (meta != NULL) { + return meta->gfid; + } else { + return -1; + } +} + +/* scan list of files and return fid corresponding to target gfid, + * returns -1 if not found */ +int unifyfs_fid_from_gfid(int gfid) +{ + int i; + for (i = 0; i < unifyfs_max_files; i++) { + if (unifyfs_filelist[i].in_use && + unifyfs_filemetas[i].gfid == gfid) { + /* found a file id that's in use and it matches + * the target fid, this is the one */ + return i; + } + } + return -1; +} + +/* Given a fid, return the path. */ +const char* unifyfs_path_from_fid(int fid) +{ + unifyfs_filename_t* fname = &unifyfs_filelist[fid]; + if (fname->in_use) { + return fname->filename; + } + return NULL; +} + +/* checks to see if a directory is empty + * assumes that check for is_dir has already been made + * only checks for full path matches, does not check relative paths, + * e.g. 
../dirname will not work + * returns 1 for yes it is empty + * returns 0 for no */ +int unifyfs_fid_is_dir_empty(const char* path) +{ + int i = 0; + while (i < unifyfs_max_files) { + /* only check this element if it's active */ + if (unifyfs_filelist[i].in_use) { + /* if the file starts with the path, it is inside of that directory + * also check that it's not the directory entry itself */ + char* strptr = strstr(path, unifyfs_filelist[i].filename); + if (strptr == unifyfs_filelist[i].filename && + strcmp(path, unifyfs_filelist[i].filename) != 0) { + /* found a child item in path */ + LOGDBG("File found: unifyfs_filelist[%d].filename = %s", + i, (char*)&unifyfs_filelist[i].filename); + return 0; + } + } + + /* go on to next file */ + i++; + } + + /* couldn't find any files with this prefix, dir must be empty */ + return 1; +} + +/* Return the global (laminated) size of the file */ +off_t unifyfs_fid_global_size(int fid) +{ + /* get meta data for this file */ + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); + if (meta != NULL) { + return meta->global_size; + } + return (off_t)-1; +} + +/* + * Return the size of the file. If the file is laminated, return the + * laminated size. If the file is not laminated, return the local + * size. + */ +off_t unifyfs_fid_logical_size(int fid) +{ + /* get meta data for this file */ + if (unifyfs_fid_is_laminated(fid)) { + return unifyfs_fid_global_size(fid); + } else { + /* invoke an rpc to ask the server what the file size is */ + + /* sync any writes to disk before requesting file size */ + unifyfs_fid_sync(fid); + + /* get file size for this file */ + size_t filesize; + int gfid = unifyfs_gfid_from_fid(fid); + int ret = invoke_client_filesize_rpc(gfid, &filesize); + if (ret != UNIFYFS_SUCCESS) { + /* failed to get file size */ + return (off_t)-1; + } + return (off_t)filesize; + } +} + +/* if we have a local fid structure corresponding to the gfid + * in question, we attempt the file lookup with the fid method + * otherwise call back to the rpc */ +off_t unifyfs_gfid_filesize(int gfid) +{ + off_t filesize = (off_t)-1; + + /* see if we have a fid for this gfid */ + int fid = unifyfs_fid_from_gfid(gfid); + if (fid >= 0) { + /* got a fid, look up file size through that + * method, since it may avoid a server rpc call */ + filesize = unifyfs_fid_logical_size(fid); + } else { + /* no fid for this gfid, + * look it up with server rpc */ + size_t size; + int ret = invoke_client_filesize_rpc(gfid, &size); + if (ret == UNIFYFS_SUCCESS) { + /* got the file size successfully */ + filesize = size; + } + } + + return filesize; +} + +/* Update local metadata for file from global metadata */ +int unifyfs_fid_update_file_meta(int fid, unifyfs_file_attr_t* gfattr) +{ + if (NULL == gfattr) { + return UNIFYFS_FAILURE; + } + + /* lookup local metadata for file */ + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); + if (meta != NULL) { + /* update lamination state */ + meta->is_laminated = gfattr->is_laminated; + if (meta->is_laminated) { + /* update file size */ + meta->global_size = (off_t)gfattr->size; + LOGDBG("laminated file size is %zu bytes", + (size_t)meta->global_size); + } + return UNIFYFS_SUCCESS; + } + /* else, bad fid */ + return UNIFYFS_FAILURE; +} + +/* + * Set the metadata values for a file (after optionally creating it). + * The gfid for the file is in f_meta->gfid. + * + * gfid: The global file id on which to set metadata. + * + * op: If set to FILE_ATTR_OP_CREATE, attempt to create the file first. 
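/* Sketch (hypothetical stubs, not the UnifyFS client API) of the size
 * policy implemented by unifyfs_fid_logical_size() above: a laminated file
 * reports its recorded global size, while a non-laminated file must ask
 * the server for the current size after syncing local writes. */
#include <stdio.h>
#include <stdbool.h>
#include <sys/types.h>

typedef struct {
    bool  is_laminated;
    off_t global_size;      /* valid only once laminated */
} demo_meta;

static off_t demo_server_filesize(void)
{
    /* stand-in for the sync + filesize RPC round trip */
    return 4096;
}

static off_t demo_logical_size(const demo_meta* meta)
{
    if (meta->is_laminated) {
        return meta->global_size;      /* trust the cached laminated size */
    }
    return demo_server_filesize();     /* otherwise ask the server */
}

int main(void)
{
    demo_meta laminated = { true, 1024 };
    demo_meta active    = { false, 0 };
    printf("%lld %lld\n", (long long)demo_logical_size(&laminated),
                          (long long)demo_logical_size(&active)); /* 1024 4096 */
    return 0;
}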
+ * If the file already exists, then update its metadata with the values + * from fid filemeta. If not creating and the file does not exist, + * then the server will return an error. + * + * gfattr: The metadata values to store. + */ +int unifyfs_set_global_file_meta(int gfid, + unifyfs_file_attr_op_e attr_op, + unifyfs_file_attr_t* gfattr) +{ + /* check that we have an input buffer */ + if (NULL == gfattr) { + return UNIFYFS_FAILURE; + } + + /* force the gfid field value to match the gfid we're + * submitting this under */ + gfattr->gfid = gfid; + + /* send file attributes to server */ + int ret = invoke_client_metaset_rpc(attr_op, gfattr); + return ret; +} + +int unifyfs_get_global_file_meta(int gfid, unifyfs_file_attr_t* gfattr) +{ + /* check that we have an output buffer to write to */ + if (NULL == gfattr) { + return UNIFYFS_FAILURE; + } + + /* attempt to lookup file attributes in key/value store */ + unifyfs_file_attr_t fmeta; + int ret = invoke_client_metaget_rpc(gfid, &fmeta); + if (ret == UNIFYFS_SUCCESS) { + /* found it, copy attributes to output struct */ + *gfattr = fmeta; + } + return ret; +} + +/* + * Set the metadata values for a file (after optionally creating it), + * using metadata associated with a given local file id. + * + * fid: The local file id on which to base global metadata values. + * + * op: If set to FILE_ATTR_OP_CREATE, attempt to create the file first. + * If the file already exists, then update its metadata with the values + * from fid filemeta. If not creating and the file does not exist, + * then the server will return an error. + */ +int unifyfs_set_global_file_meta_from_fid(int fid, unifyfs_file_attr_op_e op) +{ + /* initialize an empty file attributes structure */ + unifyfs_file_attr_t fattr; + unifyfs_file_attr_set_invalid(&fattr); + + /* lookup local metadata for file */ + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); + assert(meta != NULL); + + /* get file name */ + char* filename = (char*) unifyfs_path_from_fid(fid); + + /* set global file id */ + fattr.gfid = meta->gfid; + + LOGDBG("setting global file metadata for fid:%d gfid:%d path:%s", + fid, fattr.gfid, filename); + + /* use current time for atime/mtime/ctime */ + struct timespec tp = {0}; + clock_gettime(CLOCK_REALTIME, &tp); + fattr.atime = tp; + fattr.mtime = tp; + fattr.ctime = tp; + + /* copy file mode bits */ + fattr.mode = meta->mode; + + if (op == UNIFYFS_FILE_ATTR_OP_CREATE) { + /* these fields are set by server, except when we're creating a + * new file in which case we should initialize them both to 0 */ + fattr.is_laminated = 0; + fattr.size = 0; + + /* capture current uid and gid */ + fattr.uid = getuid(); + fattr.gid = getgid(); + + fattr.filename = filename; + } + + LOGDBG("using following attributes"); + debug_print_file_attr(&fattr); + + /* submit file attributes to global key/value store */ + int ret = unifyfs_set_global_file_meta(meta->gfid, op, &fattr); + return ret; +} + +/* allocate a file id slot for a new file + * return the fid or -1 on error */ +int unifyfs_fid_alloc(void) +{ + unifyfs_stack_lock(); + int fid = unifyfs_stack_pop(free_fid_stack); + unifyfs_stack_unlock(); + LOGDBG("unifyfs_stack_pop() gave %d", fid); + if (fid < 0) { + /* need to create a new file, but we can't */ + LOGERR("unifyfs_stack_pop() failed (%d)", fid); + return -EMFILE; + } + return fid; +} + +/* return the file id back to the free pool */ +int unifyfs_fid_free(int fid) +{ + unifyfs_stack_lock(); + unifyfs_stack_push(free_fid_stack, fid); unifyfs_stack_unlock(); return 
UNIFYFS_SUCCESS; } @@ -927,37 +1570,39 @@ int unifyfs_fid_free(int fid) * returns the new fid, or negative value on error */ int unifyfs_fid_create_file(const char* path) { + /* check that pathname is within bounds */ + size_t pathlen = strlen(path) + 1; + if (pathlen > UNIFYFS_MAX_FILENAME) { + return -ENAMETOOLONG; + } + + /* allocate an id for this file */ int fid = unifyfs_fid_alloc(); if (fid < 0) { - /* was there an error? if so, return it */ - errno = ENOSPC; return fid; } - /* mark this slot as in use and copy the filename */ + /* mark this slot as in use */ unifyfs_filelist[fid].in_use = 1; - /* TODO: check path length to see if it is < 128 bytes - * and return appropriate error if it is greater - */ - /* copy file name into slot */ strcpy((void*)&unifyfs_filelist[fid].filename, path); - LOGDBG("Filename %s got unifyfs fd %d", + LOGDBG("Filename %s got unifyfs fid %d", unifyfs_filelist[fid].filename, fid); /* initialize meta data */ unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - meta->size = 0; - meta->chunks = 0; - meta->log_size = 0; - meta->storage = FILE_STORAGE_NULL; - meta->needs_sync = 0; + assert(meta != NULL); + meta->global_size = 0; meta->flock_status = UNLOCKED; + meta->storage = FILE_STORAGE_NULL; + meta->fid = fid; + meta->gfid = unifyfs_generate_gfid(path); + meta->needs_sync = 0; meta->is_laminated = 0; - meta->mode = UNIFYFS_STAT_DEFAULT_FILE_MODE; + meta->mode = UNIFYFS_STAT_DEFAULT_FILE_MODE; - /* PTHREAD_PROCESS_SHARED allows Process-Shared Synchronization*/ + /* PTHREAD_PROCESS_SHARED allows Process-Shared Synchronization */ pthread_spin_init(&meta->fspinlock, PTHREAD_PROCESS_SHARED); return fid; @@ -965,33 +1610,35 @@ int unifyfs_fid_create_file(const char* path) int unifyfs_fid_create_directory(const char* path) { - int ret = 0; - int fid = 0; - int gfid = 0; - int found_global = 0; - int found_local = 0; + /* check that pathname is within bounds */ size_t pathlen = strlen(path) + 1; - struct stat sb = { 0, }; - unifyfs_file_attr_t gfattr = { 0, }; - unifyfs_filemeta_t* meta = NULL; - if (pathlen > UNIFYFS_MAX_FILENAME) { - return (int) UNIFYFS_ERROR_NAMETOOLONG; + return (int) ENAMETOOLONG; } - fid = unifyfs_get_fid_from_path(path); - gfid = unifyfs_generate_gfid(path); + /* get local and global file ids */ + int fid = unifyfs_get_fid_from_path(path); + int gfid = unifyfs_generate_gfid(path); - found_global = - (unifyfs_get_global_file_meta(fid, gfid, &gfattr) == UNIFYFS_SUCCESS); - found_local = (fid >= 0); + /* test whether we have info for file in our local file list */ + int found_local = (fid >= 0); - if (found_local && found_global) { - return (int) UNIFYFS_ERROR_EXIST; + /* test whether we have metadata for file in global key/value store */ + int found_global = 0; + unifyfs_file_attr_t gfattr = { 0, }; + if (unifyfs_get_global_file_meta(gfid, &gfattr) == UNIFYFS_SUCCESS) { + found_global = 1; } - if (found_local && !found_global) { - /* FIXME: so, we have detected the cache inconsistency here. + /* can't create if it already exists */ + if (found_global) { + return EEXIST; + } + + if (found_local) { + /* exists locally, but not globally + * + * FIXME: so, we have detected the cache inconsistency here. * we cannot simply unlink or remove the entry because then we also * need to check whether any subdirectories or files exist. * @@ -1002,72 +1649,50 @@ int unifyfs_fid_create_directory(const char* path) * deletes the global entry without checking any local used entries * in other processes. 
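/* Small sketch of the return convention used by unifyfs_fid_alloc() and
 * unifyfs_fid_create_file() above: failures come back as negative errno
 * values (e.g. -EMFILE, -ENAMETOOLONG), so a caller can recover the errno
 * by negating the result. The limit and helper name here are stand-ins. */
#include <errno.h>
#include <stdio.h>
#include <string.h>

static int demo_create(const char* path)
{
    if (strlen(path) + 1 > 16) {     /* stand-in for UNIFYFS_MAX_FILENAME */
        return -ENAMETOOLONG;        /* failure: negative errno value */
    }
    return 7;                        /* success: a non-negative file id */
}

int main(void)
{
    int fid = demo_create("a/very/long/path/name");
    if (fid < 0) {
        int err = -fid;              /* recover the errno value */
        printf("create failed: errno=%d\n", err);
    }
    return 0;
}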
* - * we currently return EIO, and this needs to be addressed according to - * a consistency model this fs intance assumes. + * we currently return EEXIS, and this needs to be addressed according + * to a consistency model this fs intance assumes. */ - return (int) UNIFYFS_ERROR_IO; - } - - if (!found_local && found_global) { - /* populate the local cache, then return EEXIST */ - - return (int) UNIFYFS_ERROR_EXIST; + return EEXIST; } /* now, we need to create a new directory. */ fid = unifyfs_fid_create_file(path); if (fid < 0) { - return (int) UNIFYFS_ERROR_IO; /* FIXME: ENOSPC or EIO? */ + return -fid; } - meta = unifyfs_get_meta_from_fid(fid); - /* Set as directory */ + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); + assert(meta != NULL); meta->mode = (meta->mode & ~S_IFREG) | S_IFDIR; - ret = unifyfs_set_global_file_meta(fid, gfid); - if (ret) { + /* insert global meta data for directory */ + unifyfs_file_attr_op_e op = UNIFYFS_FILE_ATTR_OP_CREATE; + int ret = unifyfs_set_global_file_meta_from_fid(fid, op); + if (ret != UNIFYFS_SUCCESS) { LOGERR("Failed to populate the global meta entry for %s (fid:%d)", path, fid); - return (int) UNIFYFS_ERROR_IO; + return ret; } return UNIFYFS_SUCCESS; } -/* read count bytes from file starting from pos and store into buf, - * all bytes are assumed to exist, so checks on file size should be - * done before calling this routine */ -int unifyfs_fid_read(int fid, off_t pos, void* buf, size_t count) +/* Write count bytes from buf into file starting at offset pos. + * + * Returns UNIFYFS_SUCCESS, or an error code + */ +int unifyfs_fid_write( + int fid, /* local file id to write to */ + off_t pos, /* starting position in file */ + const void* buf, /* buffer to be written */ + size_t count, /* number of bytes to write */ + size_t* nwritten) /* returns number of bytes written */ { int rc; - /* short-circuit a 0-byte read */ - if (count == 0) { - return UNIFYFS_SUCCESS; - } - - /* get meta for this file id */ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - - /* determine storage type to read file data */ - if (meta->storage == FILE_STORAGE_FIXED_CHUNK) { - /* file stored in fixed-size chunks */ - rc = unifyfs_fid_store_fixed_read(fid, meta, pos, buf, count); - } else { - /* unknown storage type */ - rc = (int)UNIFYFS_ERROR_IO; - } - - return rc; -} - -/* write count bytes from buf into file starting at offset pos, - * all bytes are assumed to be allocated to file, so file should - * be extended before calling this routine */ -int unifyfs_fid_write(int fid, off_t pos, const void* buf, size_t count) -{ - int rc; + /* assume we won't write anything */ + *nwritten = 0; /* short-circuit a 0-byte write */ if (count == 0) { @@ -1076,111 +1701,30 @@ int unifyfs_fid_write(int fid, off_t pos, const void* buf, size_t count) /* get meta for this file id */ unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); + assert(meta != NULL); /* determine storage type to write file data */ - if (meta->storage == FILE_STORAGE_FIXED_CHUNK || - meta->storage == FILE_STORAGE_LOGIO) { - /* file stored in fixed-size chunks */ - rc = unifyfs_fid_store_fixed_write(fid, meta, pos, buf, count); - } else { - /* unknown storage type */ - rc = (int)UNIFYFS_ERROR_IO; - } - - return rc; -} - -/* given a file id, write zero bytes to region of specified offset - * and length, assumes space is already reserved */ -int unifyfs_fid_write_zero(int fid, off_t pos, off_t count) -{ - int rc = UNIFYFS_SUCCESS; - - /* allocate an aligned chunk of memory */ - size_t buf_size 
= 1024 * 1024; - void* buf = (void*) malloc(buf_size); - if (buf == NULL) { - return (int)UNIFYFS_ERROR_IO; - } - - /* set values in this buffer to zero */ - memset(buf, 0, buf_size); - - /* write zeros to file */ - off_t written = 0; - off_t curpos = pos; - while (written < count) { - /* compute number of bytes to write on this iteration */ - size_t num = buf_size; - off_t remaining = count - written; - if (remaining < (off_t) buf_size) { - num = (size_t) remaining; - } - - /* write data to file */ - int write_rc = unifyfs_fid_write(fid, curpos, buf, num); - if (write_rc != UNIFYFS_SUCCESS) { - rc = (int)UNIFYFS_ERROR_IO; - break; - } - - /* update the number of bytes written */ - curpos += (off_t) num; - written += (off_t) num; - } - - /* free the buffer */ - free(buf); - - return rc; -} - -/* increase size of file if length is greater than current size, - * and allocate additional chunks as needed to reserve space for - * length bytes */ -int unifyfs_fid_extend(int fid, off_t length) -{ - int rc; - - /* get meta data for this file */ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - - /* determine file storage type */ - if (meta->storage == FILE_STORAGE_FIXED_CHUNK || - meta->storage == FILE_STORAGE_LOGIO) { - /* file stored in fixed-size chunks */ - rc = unifyfs_fid_store_fixed_extend(fid, meta, length); - } else { - /* unknown storage type */ - rc = (int)UNIFYFS_ERROR_IO; - } - - /* TODO: move this statement elsewhere */ - /* increase file size up to length */ - if (meta->storage == FILE_STORAGE_FIXED_CHUNK) { - if (length > meta->size) { - meta->size = length; + if (meta->storage == FILE_STORAGE_LOGIO) { + /* file stored in logged i/o */ + rc = unifyfs_fid_logio_write(fid, meta, pos, buf, count, nwritten); + if (rc == UNIFYFS_SUCCESS) { + /* write succeeded, remember that we have new data + * that needs to be synced with the server */ + meta->needs_sync = 1; + + /* optionally sync after every write */ + if (unifyfs_write_sync) { + int ret = unifyfs_sync(fid); + if (ret) { + LOGERR("client sync after write failed"); + rc = ret; + } + } } - } - - return rc; -} - -/* if length is less than reserved space, give back space down to length */ -int unifyfs_fid_shrink(int fid, off_t length) -{ - int rc; - - /* get meta data for this file */ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - - /* determine file storage type */ - if (meta->storage == FILE_STORAGE_FIXED_CHUNK) { - /* file stored in fixed-size chunks */ - rc = unifyfs_fid_store_fixed_shrink(fid, meta, length); } else { /* unknown storage type */ - rc = (int)UNIFYFS_ERROR_IO; + LOGERR("unknown storage type for fid=%d", fid); + rc = EIO; } return rc; @@ -1193,57 +1737,58 @@ int unifyfs_fid_truncate(int fid, off_t length) { /* get meta data for this file */ unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); + assert(meta != NULL); + if (meta->is_laminated) { + /* Can't truncate a laminated file */ + return EINVAL; + } - /* get current size of file */ - off_t size = meta->size; + if (meta->storage != FILE_STORAGE_LOGIO) { + /* unknown storage type */ + return EIO; + } - /* drop data if length is less than current size, - * allocate new space and zero fill it if bigger */ - if (length < size) { - /* determine the number of chunks to leave after truncating */ - int shrink_rc = unifyfs_fid_shrink(fid, length); - if (shrink_rc != UNIFYFS_SUCCESS) { - return shrink_rc; - } - } else if (length > size) { - /* file size has been extended, allocate space */ - int extend_rc = unifyfs_fid_extend(fid, length); - 
if (extend_rc != UNIFYFS_SUCCESS) { - return (int)UNIFYFS_ERROR_NOSPC; - } + /* remove/update writes past truncation size for this file id */ + int rc = truncate_write_meta(meta, length); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } - /* write zero values to new bytes */ - off_t gap_size = length - size; - int zero_rc = unifyfs_fid_write_zero(fid, size, gap_size); - if (zero_rc != UNIFYFS_SUCCESS) { - return (int)UNIFYFS_ERROR_IO; - } + /* truncate is a sync point */ + rc = unifyfs_fid_sync(fid); + if (rc != UNIFYFS_SUCCESS) { + return rc; } - /* set the new size */ - meta->size = length; + /* update global size in filemeta to reflect truncated size. + * note that log size is not affected */ + meta->global_size = length; + + /* invoke truncate rpc */ + int gfid = unifyfs_gfid_from_fid(fid); + rc = invoke_client_truncate_rpc(gfid, length); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } return UNIFYFS_SUCCESS; } -/* - * hash a path to gfid - * @param path: file path - * return: error code, gfid - * */ -static int unifyfs_get_global_fid(const char* path, int* gfid) +/* sync data for file id to server if needed */ +int unifyfs_fid_sync(int fid) { - MD5_CTX ctx; - - unsigned char md[16]; - memset(md, 0, 16); + /* assume we'll succeed */ + int ret = UNIFYFS_SUCCESS; - MD5_Init(&ctx); - MD5_Update(&ctx, path, strlen(path)); - MD5_Final(md, &ctx); + /* sync any writes to disk */ + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); + assert(meta != NULL); + if (meta->needs_sync) { + /* sync data with server */ + ret = unifyfs_sync(fid); + } - *gfid = *((int*)md); - return UNIFYFS_SUCCESS; + return ret; } /* opens a new file id with specified path, access flags, and permissions, @@ -1253,18 +1798,15 @@ static int unifyfs_get_global_fid(const char* path, int* gfid) int unifyfs_fid_open(const char* path, int flags, mode_t mode, int* outfid, off_t* outpos) { - /* check that path is short enough */ - int ret = 0; - size_t pathlen = strlen(path) + 1; - int fid = 0; - int gfid = -1; - int found_global = 0; - int found_local = 0; - off_t pos = 0; /* set the pointer to the start of the file */ - unifyfs_file_attr_t gfattr = { 0, }; + int ret; + /* set the pointer to the start of the file */ + off_t pos = 0; + + /* check that pathname is within bounds */ + size_t pathlen = strlen(path) + 1; if (pathlen > UNIFYFS_MAX_FILENAME) { - return (int) UNIFYFS_ERROR_NAMETOOLONG; + return ENAMETOOLONG; } /* check whether this file already exists */ @@ -1275,14 +1817,21 @@ int unifyfs_fid_open(const char* path, int flags, mode_t mode, int* outfid, * the broadcast for cache invalidation has not been implemented, yet. 
*/ - gfid = unifyfs_generate_gfid(path); - fid = unifyfs_get_fid_from_path(path); + /* get local and global file ids */ + int fid = unifyfs_get_fid_from_path(path); + int gfid = unifyfs_generate_gfid(path); LOGDBG("unifyfs_get_fid_from_path() gave %d (gfid = %d)", fid, gfid); - found_global = - (unifyfs_get_global_file_meta(fid, gfid, &gfattr) == UNIFYFS_SUCCESS); - found_local = (fid >= 0); + /* test whether we have info for file in our local file list */ + int found_local = (fid >= 0); + + /* test whether we have metadata for file in global key/value store */ + int found_global = 0; + unifyfs_file_attr_t gfattr = { 0, }; + if (unifyfs_get_global_file_meta(gfid, &gfattr) == UNIFYFS_SUCCESS) { + found_global = 1; + } /* * Catch any case where we could potentially want to write to a laminated @@ -1290,8 +1839,8 @@ int unifyfs_fid_open(const char* path, int flags, mode_t mode, int* outfid, */ if (gfattr.is_laminated && ((flags & (O_CREAT | O_TRUNC | O_APPEND | O_WRONLY)) || - (mode & 0222))) { - LOGDBG("Can't open with a writable flag on laminated file."); + ((mode & 0222) && (flags != O_RDONLY)))) { + LOGDBG("Can't open laminated file %s with a writable flag.", path); return EROFS; } @@ -1303,11 +1852,8 @@ int unifyfs_fid_open(const char* path, int flags, mode_t mode, int* outfid, if (found_local && !found_global) { LOGDBG("file found locally, but seems to be deleted globally. " "invalidating the local cache."); - return EROFS; - unifyfs_fid_unlink(fid); - - return (int) UNIFYFS_ERROR_NOENT; + return ENOENT; } /* for all other three cases below, we need to open the file and allocate a @@ -1318,100 +1864,89 @@ int unifyfs_fid_open(const char* path, int flags, mode_t mode, int* outfid, * create a local meta cache and also initialize the local storage * space. */ - unifyfs_filemeta_t* meta = NULL; + /* initialize local metadata for this file */ fid = unifyfs_fid_create_file(path); if (fid < 0) { LOGERR("failed to create a new file %s", path); - - /* FIXME: UNIFYFS_ERROR_NFILE or UNIFYFS_ERROR_IO ? */ - return (int) UNIFYFS_ERROR_IO; + return -fid; } - ret = unifyfs_fid_store_alloc(fid); + /* initialize local storage for this file */ + ret = fid_store_alloc(fid); if (ret != UNIFYFS_SUCCESS) { LOGERR("failed to allocate storage space for file %s (fid=%d)", path, fid); - return (int) UNIFYFS_ERROR_IO; + return ret; } - meta = unifyfs_get_meta_from_fid(fid); - - meta->size = gfattr.size; - gfattr.fid = fid; - gfattr.gfid = gfid; - - ins_file_meta(&unifyfs_fattrs, &gfattr); + /* initialize global size of file from global metadata */ + unifyfs_fid_update_file_meta(fid, &gfattr); } else if (found_local && found_global) { /* file exists and is valid. */ if ((flags & O_CREAT) && (flags & O_EXCL)) { - return (int)UNIFYFS_ERROR_EXIST; + return EEXIST; } if ((flags & O_DIRECTORY) && !unifyfs_fid_is_dir(fid)) { - return (int)UNIFYFS_ERROR_NOTDIR; + return ENOTDIR; } if (!(flags & O_DIRECTORY) && unifyfs_fid_is_dir(fid)) { - return (int)UNIFYFS_ERROR_NOTDIR; + return EISDIR; } + /* update local metadata from global metadata */ + unifyfs_fid_update_file_meta(fid, &gfattr); + if ((flags & O_TRUNC) && (flags & (O_RDWR | O_WRONLY))) { unifyfs_fid_truncate(fid, 0); } if (flags & O_APPEND) { - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - pos = meta->size; + /* We only support O_APPEND on non-laminated files */ + pos = unifyfs_fid_logical_size(fid); } } else { /* !found_local && !found_global * If we reach here, we need to create a brand new file. 
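+ *
+ * For example (hypothetical path), the first
+ *     open("/unifyfs/out.dat", O_CREAT | O_WRONLY, 0644)
+ * issued anywhere in the job lands here: no local fid exists for the
+ * path and no global attributes are found, so the code below creates
+ * the file, allocates its storage, and publishes its metadata to the
+ * key-value store.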
*/ - struct stat sb = { 0, }; - if (!(flags & O_CREAT)) { LOGERR("%s does not exist (O_CREAT not given).", path); - return (int) UNIFYFS_ERROR_NOENT; + return ENOENT; } - - LOGDBG("Creating a new entry for %s.", path); - LOGDBG("shm_super_buf = %p; free_fid_stack = %p; " - "free_chunk_stack = %p; unifyfs_filelist = %p; " - "chunks = %p", shm_super_buf, free_fid_stack, - free_chunk_stack, unifyfs_filelist, unifyfs_chunks); + LOGDBG("Creating a new entry for %s", path); /* allocate a file id slot for this new file */ fid = unifyfs_fid_create_file(path); if (fid < 0) { LOGERR("Failed to create new file %s", path); - return (int) UNIFYFS_ERROR_NFILE; + return -fid; } /* initialize the storage for the file */ - int store_rc = unifyfs_fid_store_alloc(fid); + int store_rc = fid_store_alloc(fid); if (store_rc != UNIFYFS_SUCCESS) { LOGERR("Failed to create storage for file %s", path); - return (int) UNIFYFS_ERROR_IO; + return store_rc; } - /*create a file and send its attribute to key-value store*/ - ret = unifyfs_set_global_file_meta(fid, gfid); - if (ret) { + /* insert file attribute for file in key-value store */ + unifyfs_file_attr_op_e op = UNIFYFS_FILE_ATTR_OP_CREATE; + ret = unifyfs_set_global_file_meta_from_fid(fid, op); + if (ret != UNIFYFS_SUCCESS) { LOGERR("Failed to populate the global meta entry for %s (fid:%d)", path, fid); - return (int) UNIFYFS_ERROR_IO; + return ret; } } /* TODO: allocate a free file descriptor and associate it with fid set - * in_use flag and file pointer - */ + * in_use flag and file pointer */ + + /* return local file id and starting file position */ *outfid = fid; *outpos = pos; - - LOGDBG("UNIFYFS_open generated fd %d for file %s", fid, path); - return UNIFYFS_SUCCESS; } @@ -1426,16 +1961,19 @@ int unifyfs_fid_close(int fid) /* delete a file id and return file its resources to free pools */ int unifyfs_fid_unlink(int fid) { - /* return data to free pools */ - int rc = unifyfs_fid_truncate(fid, 0); + int rc; + + /* invoke unlink rpc */ + int gfid = unifyfs_gfid_from_fid(fid); + rc = invoke_client_unlink_rpc(gfid); if (rc != UNIFYFS_SUCCESS) { - /* failed to release storage for the file, - * so bail out to keep its file id active */ + /* TODO: if item does not exist globally, but just locally, + * we still want to delete item locally */ return rc; } /* finalize the storage we're using for this file */ - rc = unifyfs_fid_store_free(fid); + rc = fid_store_free(fid); if (rc != UNIFYFS_SUCCESS) { /* released strorage for file, but failed to release * structures tracking storage, again bail out to keep @@ -1461,16 +1999,17 @@ int unifyfs_fid_unlink(int fid) return UNIFYFS_SUCCESS; } -/* --------------------------------------- - * Operations to mount file system - * --------------------------------------- */ +/* ======================================= + * Operations to mount/unmount file system + * ======================================= */ + +/* ------------- + * static APIs + * ------------- */ /* The super block is a region of shared memory that is used to - * persist file system data. It contains both room for data - * structures used to track file names, meta data, the list of - * storage blocks used for each file, and optional blocks. - * It also contains a fixed-size region for keeping log - * index entries and stat info for each file. + * persist file system meta data. It also contains a fixed-size + * region for keeping log index entries for each file. 
* * - stack of free local file ids of length max_files, * the local file id is used to index into other data @@ -1481,37 +2020,18 @@ int unifyfs_fid_unlink(int fid) * slot is in use and if so, the current file name * * - array of unifyfs_filemeta structs, indexed by local - * file id, records list of storage blocks used to - * store data for the file - * - * - array of unifyfs_chunkmeta structs, indexed by local - * file id and then by chunk id for recording metadata - * of each chunk allocated to a file, including host - * storage and id of that chunk within its storage - * - * - stack to track free list of memory chunks - * - * - stack to track free list of spillover chunks - * - * - array of storage chunks of length unifyfs_max_chunks, - * if storing data in memory + * file id * * - count of number of active index entries * - array of index metadata to track physical offset * of logical file data, of length unifyfs_max_index_entries, * entries added during write operations - * - * - count of number of active file metadata entries - * - array of file metadata to track stat info for each - * file, of length unifyfs_max_fattr_entries, filed - * in by client and read by server to record file meta - * data */ /* compute memory size of superblock in bytes, * critical to keep this consistent with - * unifyfs_init_pointers */ -static size_t unifyfs_superblock_size(void) + * init_superblock_pointers */ +static size_t get_superblock_size(void) { size_t sb_size = 0; @@ -1528,41 +2048,10 @@ static size_t unifyfs_superblock_size(void) /* file metadata struct array */ sb_size += unifyfs_max_files * sizeof(unifyfs_filemeta_t); - if (unifyfs_use_memfs) { - /* memory chunk metadata struct array for each file, - * enables a file to use all space in memory */ - sb_size += unifyfs_max_files * unifyfs_max_chunks * - sizeof(unifyfs_chunkmeta_t); - } - if (unifyfs_use_spillover) { - /* spillover chunk metadata struct array for each file, - * enables a file to use all space in spillover file */ - sb_size += unifyfs_max_files * unifyfs_spillover_max_chunks * - sizeof(unifyfs_chunkmeta_t); - } - - /* free chunk stack */ - if (unifyfs_use_memfs) { - sb_size += unifyfs_stack_bytes(unifyfs_max_chunks); - } - if (unifyfs_use_spillover) { - sb_size += unifyfs_stack_bytes(unifyfs_spillover_max_chunks); - } - - /* space for memory chunks */ - if (unifyfs_use_memfs) { - sb_size += unifyfs_page_size; - sb_size += unifyfs_max_chunks * unifyfs_chunk_size; - } - /* index region size */ sb_size += unifyfs_page_size; sb_size += unifyfs_max_index_entries * sizeof(unifyfs_index_t); - /* attribute region size */ - sb_size += unifyfs_page_size; - sb_size += unifyfs_max_fattr_entries * sizeof(unifyfs_file_attr_t); - /* return number of bytes */ return sb_size; } @@ -1581,7 +2070,7 @@ char* next_page_align(char* ptr) } /* initialize our global pointers into the given superblock */ -static void* unifyfs_init_pointers(void* superblock) +static void init_superblock_pointers(void* superblock) { char* ptr = (char*)superblock; @@ -1601,37 +2090,6 @@ static void* unifyfs_init_pointers(void* superblock) unifyfs_filemetas = (unifyfs_filemeta_t*)ptr; ptr += unifyfs_max_files * sizeof(unifyfs_filemeta_t); - /* array of chunk meta data strucutres for each file */ - unifyfs_chunkmetas = (unifyfs_chunkmeta_t*)ptr; - if (unifyfs_use_memfs) { - ptr += unifyfs_max_files * unifyfs_max_chunks * - sizeof(unifyfs_chunkmeta_t); - } - if (unifyfs_use_spillover) { - ptr += unifyfs_max_files * unifyfs_spillover_max_chunks * - sizeof(unifyfs_chunkmeta_t); - 
} - - /* stack to manage free memory data chunks */ - if (unifyfs_use_memfs) { - free_chunk_stack = ptr; - ptr += unifyfs_stack_bytes(unifyfs_max_chunks); - } - if (unifyfs_use_spillover) { - free_spillchunk_stack = ptr; - ptr += unifyfs_stack_bytes(unifyfs_spillover_max_chunks); - } - - /* Only set this up if we're using memfs */ - if (unifyfs_use_memfs) { - /* pointer to start of memory data chunks */ - ptr = next_page_align(ptr); - unifyfs_chunks = ptr; - ptr += unifyfs_max_chunks * unifyfs_chunk_size; - } else { - unifyfs_chunks = NULL; - } - /* record pointer to number of index entries */ unifyfs_indices.ptr_num_entries = (size_t*)ptr; @@ -1640,136 +2098,140 @@ static void* unifyfs_init_pointers(void* superblock) unifyfs_indices.index_entry = (unifyfs_index_t*)ptr; ptr += unifyfs_max_index_entries * sizeof(unifyfs_index_t); - /* pointer to number of file metadata entries */ - unifyfs_fattrs.ptr_num_entries = (size_t*)ptr; - - /* pointer to array of file metadata entries */ - ptr += unifyfs_page_size; - unifyfs_fattrs.meta_entry = (unifyfs_file_attr_t*)ptr; - ptr += unifyfs_max_fattr_entries * sizeof(unifyfs_file_attr_t); - /* compute size of memory we're using and check that * it matches what we allocated */ size_t ptr_size = (size_t)(ptr - (char*)superblock); - if (ptr_size > shm_super_size) { + if (ptr_size > shm_super_ctx->size) { LOGERR("Data structures in superblock extend beyond its size"); } - - return ptr; } /* initialize data structures for first use */ -static int unifyfs_init_structures() +static int init_superblock_structures(void) { - /* compute total number of storage chunks available */ - int numchunks = 0; - if (unifyfs_use_memfs) { - numchunks += unifyfs_max_chunks; - } - if (unifyfs_use_spillover) { - numchunks += unifyfs_spillover_max_chunks; - } - int i; for (i = 0; i < unifyfs_max_files; i++) { /* indicate that file id is not in use by setting flag to 0 */ unifyfs_filelist[i].in_use = 0; - - /* set pointer to array of chunkmeta data structures */ - unifyfs_filemeta_t* filemeta = &unifyfs_filemetas[i]; - - /* compute offset to start of chunk meta list for this file */ - filemeta->chunkmeta_idx = numchunks * i; } /* initialize stack of free file ids */ unifyfs_stack_init(free_fid_stack, unifyfs_max_files); - /* initialize list of free memory chunks */ - if (unifyfs_use_memfs) { - unifyfs_stack_init(free_chunk_stack, unifyfs_max_chunks); - } - - /* initialize list of free spillover chunks */ - if (unifyfs_use_spillover) { - unifyfs_stack_init(free_spillchunk_stack, unifyfs_spillover_max_chunks); - } - /* initialize count of key/value entries */ *(unifyfs_indices.ptr_num_entries) = 0; - /* initialize count of file stat structures */ - *(unifyfs_fattrs.ptr_num_entries) = 0; - LOGDBG("Meta-stacks initialized!"); return UNIFYFS_SUCCESS; } -static int unifyfs_get_spillblock(size_t size, const char* path) -{ - //MAP_OR_FAIL(open); - mode_t perms = unifyfs_getmode(0); - int spillblock_fd = __real_open(path, O_RDWR | O_CREAT | O_EXCL, perms); - if (spillblock_fd < 0) { - if (errno == EEXIST) { - /* spillover block exists; attach and return */ - spillblock_fd = __real_open(path, O_RDWR); - } else { - LOGERR("open() failed: errno=%d (%s)", errno, strerror(errno)); - return -1; - } - } else { - /* new spillover block created */ - /* TODO: align to SSD block size*/ - - /*temp*/ - off_t rc = __real_lseek(spillblock_fd, size, SEEK_SET); - if (rc < 0) { - LOGERR("lseek() failed: errno=%d (%s)", errno, strerror(errno)); - } - } - - return spillblock_fd; -} - /* create superblock of 
specified size and name, or attach to existing * block if available */ -static void* unifyfs_superblock_shmget(size_t size, key_t key) +static int init_superblock_shm(size_t super_sz) { - /* define name for superblock shared memory region */ - snprintf(shm_super_name, sizeof(shm_super_name), "%d-super-%d", - app_id, key); - LOGDBG("Key for superblock = %x", key); + char shm_name[SHMEM_NAME_LEN] = {0}; - /* open shared memory file */ - void* addr = unifyfs_shm_alloc(shm_super_name, size); - if (addr == NULL) { - LOGERR("Failed to create superblock"); - return NULL; + /* attach shmem region for client's superblock */ + sprintf(shm_name, SHMEM_SUPER_FMTSTR, unifyfs_app_id, unifyfs_client_id); + shm_context* shm_ctx = unifyfs_shm_alloc(shm_name, super_sz); + if (NULL == shm_ctx) { + LOGERR("Failed to attach to shmem superblock region %s", shm_name); + return UNIFYFS_ERROR_SHMEM; } + shm_super_ctx = shm_ctx; /* init our global variables to point to spots in superblock */ - unifyfs_init_pointers(addr); + void* addr = shm_ctx->addr; + init_superblock_pointers(addr); /* initialize structures in superblock if it's newly allocated, * we depend on shm_open setting all bytes to 0 to know that * it is not initialized */ - int32_t initialized = *(int32_t*)addr; + uint32_t initialized = *(uint32_t*)addr; if (initialized == 0) { /* not yet initialized, so initialize values within superblock */ - unifyfs_init_structures(); + init_superblock_structures(); /* superblock structure has been initialized, * so set flag to indicate that fact */ - *(int32_t*)addr = 0xDEADBEEF; + *(uint32_t*)addr = (uint32_t)0xDEADBEEF; + } else { + /* In this case, we have reattached to an existing superblock from + * an earlier run. We need to reset the segtree pointers to + * newly allocated segtrees, because they point to structures + * allocated in the last run whose memory addresses are no longer + * valid. */ + + /* TODO: what to do if a process calls unifyfs_init multiple times + * in a run? */ + + /* Clear any index entries from the cache. We do this to ensure + * the newly allocated seg trees are consistent with the extents + * in the index. It would be nice to call unifyfs_sync to flush + * any entries to the server, but we can't do that since that will + * try to rewrite the index using the trees, which point to invalid + * memory at this point. 
*/ + /* initialize count of key/value entries */ + *(unifyfs_indices.ptr_num_entries) = 0; + + int i; + for (i = 0; i < unifyfs_max_files; i++) { + /* if the file entry is active, reset its segment trees */ + if (unifyfs_filelist[i].in_use) { + /* got a live file, get pointer to its metadata */ + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(i); + assert(meta != NULL); + + /* Reset our segment tree that will record our writes */ + seg_tree_init(&meta->extents_sync); + + /* Reset our segment tree to track extents for all writes + * by this process, can be used to read back local data */ + if (unifyfs_local_extents) { + seg_tree_init(&meta->extents); + } + } + } } /* return starting memory address of super block */ - return addr; + return UNIFYFS_SUCCESS; +} + +/** + * Initialize the shared recv memory buffer to receive data from the delegators + */ +static int init_recv_shm(void) +{ + char shm_recv_name[SHMEM_NAME_LEN] = {0}; + size_t shm_recv_size = UNIFYFS_DATA_RECV_SIZE; + + /* get size of shared memory region from configuration */ + char* cfgval = client_cfg.client_recv_data_size; + if (cfgval != NULL) { + long l; + int rc = configurator_int_val(cfgval, &l); + if (rc == 0) { + shm_recv_size = (size_t) l; + } + } + + /* define file name to shared memory file */ + snprintf(shm_recv_name, sizeof(shm_recv_name), + SHMEM_DATA_FMTSTR, unifyfs_app_id, unifyfs_client_id); + + /* allocate memory for shared memory receive buffer */ + shm_recv_ctx = unifyfs_shm_alloc(shm_recv_name, shm_recv_size); + if (NULL == shm_recv_ctx) { + LOGERR("Failed to create buffer for read replies"); + return UNIFYFS_FAILURE; + } + + return UNIFYFS_SUCCESS; } -static int unifyfs_init(int rank) +static int unifyfs_init(void) { int rc; int i; @@ -1779,30 +2241,12 @@ static int unifyfs_init(int rank) char* cfgval; if (!unifyfs_initialized) { - /* unifyfs debug level default is zero */ - unifyfs_log_level = 0; - cfgval = client_cfg.log_verbosity; - if (cfgval != NULL) { - rc = configurator_int_val(cfgval, &l); - if (rc == 0) { - unifyfs_log_level = (int)l; - } - } #ifdef UNIFYFS_GOTCHA - /* insert our I/O wrappers using gotcha */ - enum gotcha_error_t result; - result = gotcha_wrap(wrap_unifyfs_list, GOTCHA_NFUNCS, "unifyfs"); - if (result != GOTCHA_SUCCESS) { - LOGERR("gotcha_wrap returned %d", (int) result); - } - - /* check for an errors when registering functions with gotcha */ - for (i = 0; i < GOTCHA_NFUNCS; i++) { - if (*(void**)(wrap_unifyfs_list[i].function_address_pointer) == 0) { - LOGERR("This function name failed to be wrapped: %s", - wrap_unifyfs_list[i].name); - } + rc = setup_gotcha_wrappers(); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("failed to setup gotcha wrappers"); + return rc; } #endif @@ -1826,24 +2270,42 @@ static int unifyfs_init(int rank) unifyfs_max_long = LONG_MAX; unifyfs_min_long = LONG_MIN; - /* will we use spillover to store the files? 
*/ - unifyfs_use_spillover = 1; - cfgval = client_cfg.spillover_enabled; + /* set our current working directory if user gave us one */ + cfgval = client_cfg.client_cwd; if (cfgval != NULL) { - rc = configurator_bool_val(cfgval, &b); - if ((rc == 0) && !b) { - unifyfs_use_spillover = 0; + unifyfs_cwd = strdup(cfgval); + + /* check that cwd falls somewhere under the mount point */ + int cwd_within_mount = 0; + if (strncmp(unifyfs_cwd, unifyfs_mount_prefix, + unifyfs_mount_prefixlen) == 0) { + /* characters in target up through mount point match, + * assume we match */ + cwd_within_mount = 1; + + /* if we have another character, it must be '/' */ + if (strlen(unifyfs_cwd) > unifyfs_mount_prefixlen && + unifyfs_cwd[unifyfs_mount_prefixlen] != '/') { + cwd_within_mount = 0; + } } - } - LOGDBG("are we using spillover? %d", unifyfs_use_spillover); - - /* determine maximum number of bytes of spillover for chunk storage */ - unifyfs_spillover_size = UNIFYFS_SPILLOVER_SIZE; - cfgval = client_cfg.spillover_size; - if (cfgval != NULL) { - rc = configurator_int_val(cfgval, &l); - if (rc == 0) { - unifyfs_spillover_size = (size_t)l; + if (!cwd_within_mount) { + /* path given in CWD is outside of the UnifyFS mount point */ + LOGERR("UNIFYFS_CLIENT_CWD '%s' must be within the mount '%s'", + unifyfs_cwd, unifyfs_mount_prefix); + + /* ignore setting and set back to NULL */ + free(unifyfs_cwd); + unifyfs_cwd = NULL; + } + } else { + /* user did not specify a CWD, so initialize with the actual + * current working dir */ + char* cwd = getcwd(NULL, 0); + if (cwd != NULL) { + unifyfs_cwd = cwd; + } else { + LOGERR("Failed getcwd (%s)", strerror(errno)); } } @@ -1857,39 +2319,33 @@ static int unifyfs_init(int rank) } } - /* determine number of bits for chunk size */ - unifyfs_chunk_bits = UNIFYFS_CHUNK_BITS; - cfgval = client_cfg.shmem_chunk_bits; + /* Determine if we should track all write extents and use them + * to service read requests if all data is local */ + unifyfs_local_extents = 0; + cfgval = client_cfg.client_local_extents; if (cfgval != NULL) { - rc = configurator_int_val(cfgval, &l); + rc = configurator_bool_val(cfgval, &b); if (rc == 0) { - unifyfs_chunk_bits = (int)l; + unifyfs_local_extents = (bool)b; } } - /* determine maximum number of bytes of memory for chunk storage */ - unifyfs_chunk_mem = UNIFYFS_CHUNK_MEM; - cfgval = client_cfg.shmem_chunk_mem; + /* Determine whether we automatically sync every write to server. + * This slows write performance, but it can serve as a work + * around for apps that do not have all necessary syncs. 
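+ * When enabled, unifyfs_fid_write() issues unifyfs_sync(fid) after each
+ * successful logio write (see that routine above). The value is read
+ * from client_cfg.client_write_sync just below; the matching runtime
+ * spelling is presumably UNIFYFS_CLIENT_WRITE_SYNC per the
+ * configurator's section_key convention (an assumption, not confirmed
+ * here).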
*/ + unifyfs_write_sync = false; + cfgval = client_cfg.client_write_sync; if (cfgval != NULL) { - rc = configurator_int_val(cfgval, &l); + rc = configurator_bool_val(cfgval, &b); if (rc == 0) { - unifyfs_chunk_mem = (size_t)l; + unifyfs_write_sync = (bool)b; } } - /* set chunk size, set chunk offset mask, and set total number - * of chunks */ - unifyfs_chunk_size = 1 << unifyfs_chunk_bits; - unifyfs_chunk_mask = unifyfs_chunk_size - 1; - unifyfs_max_chunks = unifyfs_chunk_mem >> unifyfs_chunk_bits; - - /* set number of chunks in spillover device */ - unifyfs_spillover_max_chunks = unifyfs_spillover_size >> unifyfs_chunk_bits; - /* define size of buffer used to cache key/value pairs for * data offsets before passing them to the server */ unifyfs_index_buf_size = UNIFYFS_INDEX_BUF_SIZE; - cfgval = client_cfg.logfs_index_buf_size; + cfgval = client_cfg.client_write_index_size; if (cfgval != NULL) { rc = configurator_int_val(cfgval, &l); if (rc == 0) { @@ -1899,42 +2355,6 @@ static int unifyfs_init(int rank) unifyfs_max_index_entries = unifyfs_index_buf_size / sizeof(unifyfs_index_t); - /* define size of buffer used to cache stat structures - * for files we create before passing this info - * to the server */ - unifyfs_fattr_buf_size = UNIFYFS_FATTR_BUF_SIZE; - cfgval = client_cfg.logfs_attr_buf_size; - if (cfgval != NULL) { - rc = configurator_int_val(cfgval, &l); - if (rc == 0) { - unifyfs_fattr_buf_size = (size_t)l; - } - } - unifyfs_max_fattr_entries = - unifyfs_fattr_buf_size / sizeof(unifyfs_file_attr_t); - - /* if we're using NUMA, process some configuration settings */ -#ifdef HAVE_LIBNUMA - char* env = getenv("UNIFYFS_NUMA_POLICY"); - if (env) { - sprintf(unifyfs_numa_policy, env); - LOGDBG("NUMA policy used: %s", unifyfs_numa_policy); - } else { - sprintf(unifyfs_numa_policy, "default"); - } - - env = getenv("UNIFYFS_USE_NUMA_BANK"); - if (env) { - int val = atoi(env); - if (val >= 0) { - unifyfs_numa_bank = val; - } else { - LOGERR("Incorrect NUMA bank specified in UNIFYFS_USE_NUMA_BANK." 
- " Proceeding with default allocation policy."); - } - } -#endif - /* record the max fd for the system */ /* RLIMIT_NOFILE specifies a value one greater than the maximum * file descriptor number that can be opened by this process */ @@ -1949,490 +2369,164 @@ static int unifyfs_init(int rank) /* initialize file descriptor structures */ int num_fds = UNIFYFS_MAX_FILEDESCS; - for (i = 0; i < num_fds; i++) { - unifyfs_fd_init(i); - } - - /* initialize file stream structures */ - int num_streams = UNIFYFS_MAX_FILEDESCS; - for (i = 0; i < num_streams; i++) { - unifyfs_stream_init(i); - } - - /* initialize directory stream structures */ - int num_dirstreams = UNIFYFS_MAX_FILEDESCS; - for (i = 0; i < num_dirstreams; i++) { - unifyfs_dirstream_init(i); - } - - /* initialize stack of free fd values */ - size_t free_fd_size = unifyfs_stack_bytes(num_fds); - unifyfs_fd_stack = malloc(free_fd_size); - unifyfs_stack_init(unifyfs_fd_stack, num_fds); - - /* initialize stack of free stream values */ - size_t free_stream_size = unifyfs_stack_bytes(num_streams); - unifyfs_stream_stack = malloc(free_stream_size); - unifyfs_stack_init(unifyfs_stream_stack, num_streams); - - /* initialize stack of free directory stream values */ - size_t free_dirstream_size = unifyfs_stack_bytes(num_dirstreams); - unifyfs_dirstream_stack = malloc(free_dirstream_size); - unifyfs_stack_init(unifyfs_dirstream_stack, num_dirstreams); - - /* determine the size of the superblock */ - shm_super_size = unifyfs_superblock_size(); - - /* get a superblock of shared memory and initialize our - * global variables for this block */ - shm_super_buf = unifyfs_superblock_shmget( - shm_super_size, unifyfs_mount_shmget_key); - if (shm_super_buf == NULL) { - LOGERR("unifyfs_superblock_shmget() failed"); - return UNIFYFS_FAILURE; - } - - /* initialize spillover store */ - if (unifyfs_use_spillover) { - /* get directory in which to create spill over files */ - cfgval = client_cfg.spillover_data_dir; - if (cfgval != NULL) { - strncpy(external_data_dir, cfgval, sizeof(external_data_dir)); - } else { - LOGERR("UNIFYFS_SPILLOVER_DATA_DIR not set, must be an existing" - " writable path (e.g., /mnt/ssd):"); - return UNIFYFS_FAILURE; - } - - /* define path to the spill over file for data chunks */ - char spillfile_prefix[UNIFYFS_MAX_FILENAME]; - snprintf(spillfile_prefix, sizeof(spillfile_prefix), - "%s/spill_%d_%d.log", - external_data_dir, app_id, local_rank_idx); - - /* create the spill over file */ - unifyfs_spilloverblock = - unifyfs_get_spillblock(unifyfs_spillover_size, - spillfile_prefix); - if (unifyfs_spilloverblock < 0) { - LOGERR("unifyfs_get_spillblock() failed!"); - return UNIFYFS_FAILURE; - } - - /* get directory in which to create spill over files - * for key/value pairs */ - cfgval = client_cfg.spillover_meta_dir; - if (cfgval != NULL) { - strncpy(external_meta_dir, cfgval, sizeof(external_meta_dir)); - } else { - LOGERR("UNIFYFS_SPILLOVER_META_DIR not set, must be an existing" - " writable path (e.g., /mnt/ssd):"); - return UNIFYFS_FAILURE; - } - - /* define path to the spill over file for key/value pairs */ - snprintf(spillfile_prefix, sizeof(spillfile_prefix), - "%s/spill_index_%d_%d.log", - external_meta_dir, app_id, local_rank_idx); - - /* create the spill over file for key value data */ - unifyfs_spillmetablock = - unifyfs_get_spillblock(unifyfs_index_buf_size, - spillfile_prefix); - if (unifyfs_spillmetablock < 0) { - LOGERR("unifyfs_get_spillmetablock failed!"); - return UNIFYFS_FAILURE; - } - } - - /* remember that we've now 
initialized the library */ - unifyfs_initialized = 1; - } - - return UNIFYFS_SUCCESS; -} - -/* --------------------------------------- - * APIs exposed to external libraries - * --------------------------------------- */ - -/* Fill mount rpc input struct with client-side context info */ -void fill_client_mount_info(unifyfs_mount_in_t* in) -{ - size_t meta_offset = (char*)unifyfs_indices.ptr_num_entries - - (char*)shm_super_buf; - size_t meta_size = unifyfs_max_index_entries - * sizeof(unifyfs_index_t); - - size_t fmeta_offset = (char*)unifyfs_fattrs.ptr_num_entries - - (char*)shm_super_buf; - size_t fmeta_size = unifyfs_max_fattr_entries - * sizeof(unifyfs_file_attr_t); - - size_t data_offset = (char*)unifyfs_chunks - (char*)shm_super_buf; - size_t data_size = (size_t)unifyfs_max_chunks * unifyfs_chunk_size; - - in->app_id = app_id; - in->local_rank_idx = local_rank_idx; - in->dbg_rank = client_rank; - in->num_procs_per_node = local_rank_cnt; - in->req_buf_sz = shm_req_size; - in->recv_buf_sz = shm_recv_size; - in->superblock_sz = shm_super_size; - in->meta_offset = meta_offset; - in->meta_size = meta_size; - in->fmeta_offset = fmeta_offset; - in->fmeta_size = fmeta_size; - in->data_offset = data_offset; - in->data_size = data_size; - in->external_spill_dir = strdup(external_data_dir); -} - -/** - * Initialize the shared recv memory buffer to receive data from the delegators - */ -static int unifyfs_init_recv_shm(int local_rank_idx, int app_id) -{ - /* get size of shared memory region from configuration */ - char* cfgval = client_cfg.shmem_recv_size; - if (cfgval != NULL) { - long l; - int rc = configurator_int_val(cfgval, &l); - if (rc == 0) { - shm_recv_size = l; - } - } - - /* define file name to shared memory file */ - snprintf(shm_recv_name, sizeof(shm_recv_name), - "%d-recv-%d", app_id, local_rank_idx); - - /* allocate memory for shared memory receive buffer */ - shm_recv_buf = unifyfs_shm_alloc(shm_recv_name, shm_recv_size); - if (shm_recv_buf == NULL) { - LOGERR("Failed to create buffer for read replies"); - return UNIFYFS_FAILURE; - } - - return UNIFYFS_SUCCESS; -} - -/** - * Initialize the shared request memory, which - * is used to buffer the list of read requests - * to be transferred to the delegator on the - * server side. - * @param local_rank_idx: local process id - * @param app_id: which application this - * process is from - * @return success/error code - */ -static int unifyfs_init_req_shm(int local_rank_idx, int app_id) -{ - /* get size of shared memory region from configuration */ - char* cfgval = client_cfg.shmem_req_size; - if (cfgval != NULL) { - long l; - int rc = configurator_int_val(cfgval, &l); - if (rc == 0) { - shm_req_size = l; - } - } - - /* define name of shared memory region for request buffer */ - snprintf(shm_req_name, sizeof(shm_req_name), - "%d-req-%d", app_id, local_rank_idx); - - /* allocate memory for shared memory receive buffer */ - shm_req_buf = unifyfs_shm_alloc(shm_req_name, shm_req_size); - if (shm_req_buf == NULL) { - LOGERR("Failed to create buffer for read requests"); - return UNIFYFS_FAILURE; - } - - return UNIFYFS_SUCCESS; -} - - -#if defined(UNIFYFS_USE_DOMAIN_SOCKET) -/** - * initialize the client-side socket - * used to communicate with the server-side - * delegators. Each client is serviced by - * one delegator. 
- * @param proc_id: local process id - * @param l_num_procs_per_node: number - * of ranks on each compute node - * @param l_num_del_per_node: number of server-side - * delegators on the same node - * @return success/error code - */ -static int unifyfs_init_socket(int proc_id, int l_num_procs_per_node, - int l_num_del_per_node) -{ - int rc = -1; - int nprocs_per_del; - int len; - int result; - int flag; - struct sockaddr_un serv_addr; - char tmp_path[UNIFYFS_MAX_FILENAME] = {0}; - char* pmi_path = NULL; - - client_sockfd = socket(AF_UNIX, SOCK_STREAM, 0); - if (client_sockfd < 0) { - LOGERR("socket create failed"); - return -1; - } + for (i = 0; i < num_fds; i++) { + unifyfs_fd_init(i); + } - /* calculate delegator assignment */ - nprocs_per_del = l_num_procs_per_node / l_num_del_per_node; - if ((l_num_procs_per_node % l_num_del_per_node) != 0) { - nprocs_per_del++; - } - snprintf(tmp_path, sizeof(tmp_path), "%s.%d.%d", - SOCKET_PATH, getuid(), (proc_id / nprocs_per_del)); + /* initialize file stream structures */ + int num_streams = UNIFYFS_MAX_FILEDESCS; + for (i = 0; i < num_streams; i++) { + unifyfs_stream_init(i); + } - // lookup domain socket path in key-val store - if (unifyfs_keyval_lookup_local(key_unifyfsd_socket, &pmi_path) == 0) { - memset(tmp_path, 0, sizeof(tmp_path)); - snprintf(tmp_path, sizeof(tmp_path), "%s", pmi_path); - free(pmi_path); - } + /* initialize directory stream structures */ + int num_dirstreams = UNIFYFS_MAX_FILEDESCS; + for (i = 0; i < num_dirstreams; i++) { + unifyfs_dirstream_init(i); + } - memset(&serv_addr, 0, sizeof(serv_addr)); - serv_addr.sun_family = AF_UNIX; - strcpy(serv_addr.sun_path, tmp_path); - len = sizeof(serv_addr); - result = connect(client_sockfd, (struct sockaddr*)&serv_addr, len); + /* initialize stack of free fd values */ + size_t free_fd_size = unifyfs_stack_bytes(num_fds); + unifyfs_fd_stack = malloc(free_fd_size); + unifyfs_stack_init(unifyfs_fd_stack, num_fds); - /* exit with error if connection is not successful */ - if (result == -1) { - rc = -1; - LOGERR("socket connect failed"); - return rc; - } + /* initialize stack of free stream values */ + size_t free_stream_size = unifyfs_stack_bytes(num_streams); + unifyfs_stream_stack = malloc(free_stream_size); + unifyfs_stack_init(unifyfs_stream_stack, num_streams); - flag = fcntl(client_sockfd, F_GETFL); - fcntl(client_sockfd, F_SETFL, flag | O_NONBLOCK); + /* initialize stack of free directory stream values */ + size_t free_dirstream_size = unifyfs_stack_bytes(num_dirstreams); + unifyfs_dirstream_stack = malloc(free_dirstream_size); + unifyfs_stack_init(unifyfs_dirstream_stack, num_dirstreams); - cmd_fd.fd = client_sockfd; - cmd_fd.events = POLLIN | POLLHUP; - cmd_fd.revents = 0; + /* determine the size of the superblock */ + size_t shm_super_size = get_superblock_size(); - return 0; -} -#endif // UNIFYFS_USE_DOMAIN_SOCKET + /* get a superblock of shared memory and initialize our + * global variables for this block */ + rc = init_superblock_shm(shm_super_size); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("failed to initialize superblock shmem"); + return rc; + } -int compare_fattr(const void* a, const void* b) -{ - const unifyfs_file_attr_t* ptr_a = a; - const unifyfs_file_attr_t* ptr_b = b; + /* create shared memory region for holding data for read replies */ + rc = init_recv_shm(); + if (rc < 0) { + LOGERR("failed to initialize data recv shmem"); + return UNIFYFS_FAILURE; + } - if (ptr_a->fid > ptr_b->fid) { - return 1; - } + /* initialize log-based I/O context */ + rc = 
unifyfs_logio_init_client(unifyfs_app_id, unifyfs_client_id, + &client_cfg, &logio_ctx); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("failed to initialize log-based I/O (rc = %s)", + unifyfs_rc_enum_str(rc)); + return rc; + } - if (ptr_a->fid < ptr_b->fid) { - return -1; + /* remember that we've now initialized the library */ + unifyfs_initialized = 1; } - return 0; + return UNIFYFS_SUCCESS; } -static int compare_int(const void* a, const void* b) +/* free resources allocated during unifyfs_init(). + * generally, we do this in reverse order with respect to + * how things were initialized */ +static int unifyfs_finalize(void) { - const int* ptr_a = a; - const int* ptr_b = b; + int rc = UNIFYFS_SUCCESS; - if (*ptr_a - *ptr_b > 0) { - return 1; + if (!unifyfs_initialized) { + /* not initialized yet, so we shouldn't call finalize */ + return UNIFYFS_FAILURE; } - if (*ptr_a - *ptr_b < 0) { - return -1; + /* close spillover files */ + if (NULL != logio_ctx) { + unifyfs_logio_close(logio_ctx, 0); + logio_ctx = NULL; } - - return 0; -} - -static int compare_name_rank_pair(const void* a, const void* b) -{ - const name_rank_pair_t* pair_a = a; - const name_rank_pair_t* pair_b = b; - - if (strcmp(pair_a->hostname, pair_b->hostname) > 0) { - return 1; + if (unifyfs_spillmetablock != -1) { + close(unifyfs_spillmetablock); + unifyfs_spillmetablock = -1; } - if (strcmp(pair_a->hostname, pair_b->hostname) < 0) { - return -1; - } + /* detach from superblock shmem, but don't unlink the file so that + * a later client can reattach. */ + unifyfs_shm_free(&shm_super_ctx); - return 0; -} + /* unlink and detach from data receive shmem */ + unifyfs_shm_unlink(shm_recv_ctx); + unifyfs_shm_free(&shm_recv_ctx); -/** - * calculate the number of ranks per node - * - * sets global variables local_rank_cnt & local_rank_idx - * - * @param numTasks: number of tasks in the application - * @return success/error code - */ -static int CountTasksPerNode(int rank, int numTasks) -{ - char hostname[UNIFYFS_MAX_HOSTNAME]; - char localhost[UNIFYFS_MAX_HOSTNAME]; - int resultsLen = UNIFYFS_MAX_HOSTNAME; - MPI_Status status; - int i, j, rc; - int* local_rank_lst; + /* free directory stream stack */ + if (unifyfs_dirstream_stack != NULL) { + free(unifyfs_dirstream_stack); + unifyfs_dirstream_stack = NULL; + } - if (numTasks <= 0) { - LOGERR("invalid number of tasks"); - return -1; + /* free file stream stack */ + if (unifyfs_stream_stack != NULL) { + free(unifyfs_stream_stack); + unifyfs_stream_stack = NULL; } - rc = MPI_Get_processor_name(localhost, &resultsLen); - if (rc != 0) { - LOGERR("failed to get the processor's name"); + /* free file descriptor stack */ + if (unifyfs_fd_stack != NULL) { + free(unifyfs_fd_stack); + unifyfs_fd_stack = NULL; } - if (rank == 0) { - /* a container of (rank, host) mappings*/ - name_rank_pair_t* host_set = - (name_rank_pair_t*)calloc(numTasks, - sizeof(name_rank_pair_t)); + /* no longer initialized, so update the flag */ + unifyfs_initialized = 0; - strcpy(host_set[0].hostname, localhost); - host_set[0].rank = 0; + return rc; +} - /* - * MPI_Recv all hostnames, and compare to local hostname - */ - for (i = 1; i < numTasks; i++) { - rc = MPI_Recv(hostname, UNIFYFS_MAX_HOSTNAME, - MPI_CHAR, MPI_ANY_SOURCE, - MPI_ANY_TAG, MPI_COMM_WORLD, - &status); - if (rc != 0) { - LOGERR("cannot receive hostnames"); - return -1; - } - strcpy(host_set[i].hostname, hostname); - host_set[i].rank = status.MPI_SOURCE; - } - /* sort by hostname */ - qsort(host_set, numTasks, sizeof(name_rank_pair_t), - 
compare_name_rank_pair); +/* --------------- + * external APIs + * --------------- */ - /* - * rank_cnt: records the number of processes on each node - * rank_set: the list of ranks for each node - */ - int** rank_set = (int**)calloc(numTasks, sizeof(int*)); - int* rank_cnt = (int*)calloc(numTasks, sizeof(int)); - int cursor = 0; - int set_counter = 0; - - for (i = 1; i < numTasks; i++) { - if (strcmp(host_set[i].hostname, - host_set[i - 1].hostname) != 0) { - // found a different host, so switch to a new set - rank_set[set_counter] = - (int*)calloc((i - cursor), sizeof(int)); - rank_cnt[set_counter] = i - cursor; - int hiter, riter = 0; - for (hiter = cursor; hiter < i; hiter++, riter++) { - rank_set[set_counter][riter] = host_set[hiter].rank; - } +/* Fill mount rpc input struct with client-side context info */ +void fill_client_mount_info(unifyfs_mount_in_t* in) +{ + in->dbg_rank = client_rank; + in->mount_prefix = strdup(client_cfg.unifyfs_mountpoint); +} - set_counter++; - cursor = i; - } - } +/* Fill attach rpc input struct with client-side context info */ +void fill_client_attach_info(unifyfs_attach_in_t* in) +{ + size_t meta_offset = (char*)unifyfs_indices.ptr_num_entries - + (char*)shm_super_ctx->addr; + size_t meta_size = unifyfs_max_index_entries + * sizeof(unifyfs_index_t); - /* fill rank_cnt and rank_set entry for the last node */ - rank_set[set_counter] = (int*)calloc((i - cursor), sizeof(int)); - rank_cnt[set_counter] = numTasks - cursor; - j = 0; - for (i = cursor; i < numTasks; i++, j++) { - rank_set[set_counter][j] = host_set[i].rank; - } - set_counter++; - - /* broadcast the rank_cnt and rank_set information to each rank */ - int root_set_no = -1; - for (i = 0; i < set_counter; i++) { - /* send each rank set to all of its ranks */ - for (j = 0; j < rank_cnt[i]; j++) { - if (rank_set[i][j] != 0) { - rc = MPI_Send(&rank_cnt[i], 1, MPI_INT, rank_set[i][j], - 0, MPI_COMM_WORLD); - if (rc != 0) { - LOGERR("cannot send local rank cnt"); - return -1; - } - rc = MPI_Send(rank_set[i], rank_cnt[i], MPI_INT, - rank_set[i][j], 0, MPI_COMM_WORLD); - if (rc != 0) { - LOGERR("cannot send local rank list"); - return -1; - } - } else { - root_set_no = i; - local_rank_cnt = rank_cnt[i]; - local_rank_lst = (int*)calloc(rank_cnt[i], sizeof(int)); - memcpy(local_rank_lst, rank_set[i], - (local_rank_cnt * sizeof(int))); - } - } - } + in->app_id = unifyfs_app_id; + in->client_id = unifyfs_client_id; + in->shmem_data_size = shm_recv_ctx->size; + in->shmem_super_size = shm_super_ctx->size; + in->meta_offset = meta_offset; + in->meta_size = meta_size; - for (i = 0; i < set_counter; i++) { - free(rank_set[i]); - } - free(rank_cnt); - free(host_set); - free(rank_set); + if (NULL != logio_ctx->shmem) { + in->logio_mem_size = logio_ctx->shmem->size; } else { - /* non-root process - MPI_Send hostname to root node */ - rc = MPI_Send(localhost, UNIFYFS_MAX_HOSTNAME, MPI_CHAR, - 0, 0, MPI_COMM_WORLD); - if (rc != 0) { - LOGERR("cannot send host name"); - return -1; - } - /* receive the local rank set count */ - rc = MPI_Recv(&local_rank_cnt, 1, MPI_INT, - 0, 0, MPI_COMM_WORLD, &status); - if (rc != 0) { - LOGERR("cannot receive local rank cnt"); - return -1; - } - /* receive the the local rank set */ - local_rank_lst = (int*)calloc(local_rank_cnt, sizeof(int)); - rc = MPI_Recv(local_rank_lst, local_rank_cnt, MPI_INT, - 0, 0, MPI_COMM_WORLD, &status); - if (rc != 0) { - free(local_rank_lst); - LOGERR("cannot receive local rank list"); - return -1; - } + in->logio_mem_size = 0; } - /* sort local ranks by 
rank */ - qsort(local_rank_lst, local_rank_cnt, sizeof(int), compare_int); - for (i = 0; i < local_rank_cnt; i++) { - if (local_rank_lst[i] == rank) { - local_rank_idx = i; - break; - } + in->logio_spill_size = logio_ctx->spill_sz; + if (logio_ctx->spill_sz) { + in->logio_spill_dir = strdup(client_cfg.logio_spill_dir); + } else { + in->logio_spill_dir = NULL; } - free(local_rank_lst); - return 0; } /** @@ -2449,8 +2543,6 @@ int unifyfs_mount(const char prefix[], int rank, size_t size, { int rc; int kv_rank, kv_nranks; - bool b; - char* cfgval; if (-1 != unifyfs_mounted) { if (l_app_id != unifyfs_mounted) { @@ -2462,19 +2554,13 @@ int unifyfs_mount(const char prefix[], int rank, size_t size, } } - /* record our rank for debugging messages, - * record the value we should use for an app_id */ - app_id = l_app_id; + // record our rank for debugging messages client_rank = rank; global_rank_cnt = (int)size; - /* print log messages to stderr */ + // print log messages to stderr unifyfs_log_open(NULL); - /************************ - * read configuration values - ************************/ - // initialize configuration rc = unifyfs_config_init(&client_cfg, 0, NULL); if (rc) { @@ -2483,11 +2569,26 @@ int unifyfs_mount(const char prefix[], int rank, size_t size, } client_cfg.ptype = UNIFYFS_CLIENT; - // update configuration from runstate file - rc = unifyfs_read_runstate(&client_cfg, NULL); - if (rc) { - LOGERR("failed to update configuration from runstate."); - return UNIFYFS_FAILURE; + // set log level from config + char* cfgval = client_cfg.log_verbosity; + if (cfgval != NULL) { + long l; + rc = configurator_int_val(cfgval, &l); + if (rc == 0) { + unifyfs_set_log_level((unifyfs_log_level_t)l); + } + } + + // record mountpoint prefix string + unifyfs_mount_prefix = strdup(prefix); + unifyfs_mount_prefixlen = strlen(unifyfs_mount_prefix); + client_cfg.unifyfs_mountpoint = unifyfs_mount_prefix; + + // generate app_id from mountpoint prefix + unifyfs_app_id = unifyfs_generate_gfid(unifyfs_mount_prefix); + if (l_app_id != 0) { + LOGDBG("ignoring passed app_id=%d, using mountpoint app_id=%d", + l_app_id, unifyfs_app_id); } // initialize k-v store access @@ -2495,94 +2596,54 @@ int unifyfs_mount(const char prefix[], int rank, size_t size, kv_nranks = size; rc = unifyfs_keyval_init(&client_cfg, &kv_rank, &kv_nranks); if (rc) { - LOGERR("failed to update configuration from runstate."); + LOGERR("failed to initialize kvstore"); return UNIFYFS_FAILURE; } if ((client_rank != kv_rank) || (size != kv_nranks)) { LOGDBG("mismatch on mount vs kvstore rank/size"); } - /************************ - * record our mount point, and initialize structures to - * store data - ************************/ - - /* record a copy of the prefix string defining the mount point - * we should intercept */ - unifyfs_mount_prefix = strdup(prefix); - unifyfs_mount_prefixlen = strlen(unifyfs_mount_prefix); - - /* - * unifyfs_mount_shmget_key marks the start of - * the superblock shared memory of each rank - * each process has three types of shared memory: - * request memory, recv memory and superblock - * memory. We set unifyfs_mount_shmget_key in - * this way to avoid different ranks conflicting - * on the same name in shm_open. 
- */ - cfgval = client_cfg.shmem_single; - if (cfgval != NULL) { - rc = configurator_bool_val(cfgval, &b); - if ((rc == 0) && b) { - unifyfs_use_single_shm = 1; - } - } - - /* compute our local rank on the node, - * the following call initializes local_rank_{cnt,ndx} */ - rc = CountTasksPerNode(rank, size); - if (rc < 0) { - LOGERR("cannot get the local rank list."); - return -1; - } - - /* use our local rank on the node in shared memory and file - * names to avoid conflicting with other procs on our node */ - unifyfs_mount_shmget_key = local_rank_idx; - - /* initialize our library, creates superblock and spillover files */ - int ret = unifyfs_init(rank); - if (ret != UNIFYFS_SUCCESS) { - return ret; - } - /* open rpc connection to server */ - ret = unifyfs_client_rpc_init(); - if (ret != UNIFYFS_SUCCESS) { - LOGERR("Failed to initialize client RPC"); - return ret; + rc = unifyfs_client_rpc_init(); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("failed to initialize client RPC"); + return rc; } - /* call client mount rpc function here - * to register our shared memory and files with server */ - LOGDBG("calling mount"); - invoke_client_mount_rpc(); - -#if defined(UNIFYFS_USE_DOMAIN_SOCKET) - /* open a socket to the server */ - rc = unifyfs_init_socket(local_rank_idx, local_rank_cnt, - local_del_cnt); - if (rc < 0) { - LOGERR("failed to initialize socket, rc == %d", rc); - return UNIFYFS_FAILURE; + /* Call client mount rpc function to get client id */ + LOGDBG("calling mount rpc"); + rc = invoke_client_mount_rpc(); + if (rc != UNIFYFS_SUCCESS) { + /* If we fail to connect to the server, bail with an error */ + LOGERR("failed to mount to server"); + return rc; } -#endif - /* create shared memory region for read requests */ - rc = unifyfs_init_req_shm(local_rank_idx, app_id); - if (rc < 0) { - LOGERR("failed to init shared request memory"); - return UNIFYFS_FAILURE; + /* initialize our library using assigned client id, creates shared memory + * regions (e.g., superblock and data recv) and inits log-based I/O */ + rc = unifyfs_init(); + if (rc != UNIFYFS_SUCCESS) { + return rc; } - /* create shared memory region for holding data for read replies */ - rc = unifyfs_init_recv_shm(local_rank_idx, app_id); - if (rc < 0) { - LOGERR("failed to init shared receive memory"); - return UNIFYFS_FAILURE; + /* Call client attach rpc function to register our newly created shared + * memory and files with server */ + LOGDBG("calling attach rpc"); + rc = invoke_client_attach_rpc(); + if (rc != UNIFYFS_SUCCESS) { + /* If we fail, bail with an error */ + LOGERR("failed to attach to server"); + unifyfs_finalize(); + return rc; } + /* Once we return from attach, we know the server has attached to our + * shared memory region for read replies, so we can safely remove the + * file. The memory region will stay active until both client and server + * unmap them. We keep the superblock file around so that a future client + * can reattach to it. 
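+ *
+ * Sketch of the shmem lifetime as these helpers are used in this file:
+ *   ctx = unifyfs_shm_alloc(name, size);   attach (create if needed)
+ *   unifyfs_shm_unlink(ctx);               drop the name; existing
+ *                                          mappings stay usable
+ *   unifyfs_shm_free(&ctx);                unmap and release the handle
+ * Only the read-reply (recv) region is unlinked at this point; the
+ * superblock keeps its name so a restarted client can reattach to it.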
*/ + unifyfs_shm_unlink(shm_recv_ctx); + /* add mount point as a new directory in the file list */ if (unifyfs_get_fid_from_path(prefix) < 0) { /* no entry exists for mount point, so create one */ @@ -2591,66 +2652,17 @@ int unifyfs_mount(const char prefix[], int rank, size_t size, /* if there was an error, return it */ LOGERR("failed to create directory entry for mount point: `%s'", prefix); + unifyfs_finalize(); return UNIFYFS_FAILURE; } } /* record client state as mounted for specific app_id */ - unifyfs_mounted = app_id; + unifyfs_mounted = unifyfs_app_id; return UNIFYFS_SUCCESS; } -/* free resources allocated during unifyfs_init, - * generally we do this in reverse order that - * things were initailized in */ -static int unifyfs_finalize(void) -{ - int rc = UNIFYFS_SUCCESS; - - if (!unifyfs_initialized) { - /* not initialized yet, so we shouldn't call finalize */ - return UNIFYFS_FAILURE; - } - - /* close spillover files */ - if (unifyfs_spilloverblock != 0) { - close(unifyfs_spilloverblock); - unifyfs_spilloverblock = 0; - } - - if (unifyfs_spillmetablock != 0) { - close(unifyfs_spillmetablock); - unifyfs_spillmetablock = 0; - } - - /* detach from superblock */ - unifyfs_shm_free(shm_super_name, shm_super_size, &shm_super_buf); - - /* free directory stream stack */ - if (unifyfs_dirstream_stack != NULL) { - free(unifyfs_dirstream_stack); - unifyfs_dirstream_stack = NULL; - } - - /* free file stream stack */ - if (unifyfs_stream_stack != NULL) { - free(unifyfs_stream_stack); - unifyfs_stream_stack = NULL; - } - - /* free file descriptor stack */ - if (unifyfs_fd_stack != NULL) { - free(unifyfs_fd_stack); - unifyfs_fd_stack = NULL; - } - - /* no longer initialized, so update the flag */ - unifyfs_initialized = 0; - - return rc; -} - /** * unmount the mounted file system * TODO: Add support for unmounting more than @@ -2666,24 +2678,18 @@ int unifyfs_unmount(void) return UNIFYFS_SUCCESS; } + /* sync any outstanding writes */ + LOGDBG("syncing data"); + rc = unifyfs_sync(-1); + if (rc) { + LOGERR("client sync failed"); + ret = UNIFYFS_FAILURE; + } + /************************ * tear down connection to server ************************/ - /* detach from shared memory regions */ - unifyfs_shm_free(shm_req_name, shm_req_size, &shm_req_buf); - unifyfs_shm_free(shm_recv_name, shm_recv_size, &shm_recv_buf); - - /* close socket to server */ - if (client_sockfd >= 0) { - errno = 0; - rc = close(client_sockfd); - if (rc != 0) { - LOGERR("Failed to close() socket to server errno=%d (%s)", - errno, strerror(errno)); - } - } - /* invoke unmount rpc to tell server we're disconnecting */ LOGDBG("calling unmount"); rc = invoke_client_unmount_rpc(); @@ -2708,12 +2714,18 @@ int unifyfs_unmount(void) free(unifyfs_mount_prefix); unifyfs_mount_prefix = NULL; unifyfs_mount_prefixlen = 0; + client_cfg.unifyfs_mountpoint = NULL; } /************************ * free configuration values ************************/ + /* free global holding current working directory */ + if (unifyfs_cwd != NULL) { + free(unifyfs_cwd); + } + /* clean up configuration */ rc = unifyfs_config_fini(&client_cfg); if (rc) { @@ -2729,7 +2741,7 @@ int unifyfs_unmount(void) return ret; } -#define UNIFYFS_TX_BUFSIZE (64*(1<<10)) +#define UNIFYFS_TX_BUFSIZE (8*(1<<20)) enum { UNIFYFS_TX_STAGE_OUT = 0, @@ -2747,19 +2759,25 @@ ssize_t do_transfer_data(int fd_src, int fd_dst, off_t offset, size_t count) ssize_t n_left = 0; ssize_t n_processed = 0; size_t len = UNIFYFS_TX_BUFSIZE; - char buf[UNIFYFS_TX_BUFSIZE] = { 0, }; + char* buf = NULL; + + buf = 
malloc(UNIFYFS_TX_BUFSIZE); + if (!buf) { + LOGERR("failed to allocate transfer buffer"); + return ENOMEM; + } pos = lseek(fd_src, offset, SEEK_SET); if (pos == (off_t) -1) { LOGERR("lseek failed (%d: %s)\n", errno, strerror(errno)); - ret = -1; + ret = errno; goto out; } pos = lseek(fd_dst, offset, SEEK_SET); if (pos == (off_t) -1) { LOGERR("lseek failed (%d: %s)\n", errno, strerror(errno)); - ret = -1; + ret = errno; goto out; } @@ -2794,6 +2812,11 @@ ssize_t do_transfer_data(int fd_src, int fd_dst, off_t offset, size_t count) } out: + if (buf) { + free(buf); + buf = NULL; + } + return ret; } @@ -2803,7 +2826,6 @@ static int do_transfer_file_serial(const char* src, const char* dst, int ret = 0; int fd_src = 0; int fd_dst = 0; - char buf[UNIFYFS_TX_BUFSIZE] = { 0, }; /* * for now, we do not use the @dir hint. @@ -2820,6 +2842,9 @@ static int do_transfer_file_serial(const char* src, const char* dst, goto out_close_src; } + LOGDBG("serial transfer (%d/%d): offset=0, length=%lu", + client_rank, global_rank_cnt, (unsigned long) sb_src->st_size); + ret = do_transfer_data(fd_src, fd_dst, 0, sb_src->st_size); if (ret < 0) { LOGERR("do_transfer_data failed!"); @@ -2850,15 +2875,10 @@ static int do_transfer_file_parallel(const char* src, const char* dst, fd_src = open(src, O_RDONLY); if (fd_src < 0) { + LOGERR("failed to open file %s", src); return errno; } - fd_dst = open(dst, O_CREAT | O_WRONLY | O_TRUNC, 0644); - if (fd_dst < 0) { - ret = errno; - goto out_close_src; - } - /* * if the file is smaller than (rankcount*buffersize), just do with the * serial mode. @@ -2896,18 +2916,35 @@ static int do_transfer_file_parallel(const char* src, const char* dst, if (client_rank == (global_rank_cnt - 1)) { len = (n_chunks - 1) * UNIFYFS_TX_BUFSIZE; - len += size % UNIFYFS_TX_BUFSIZE; + remainder = size % UNIFYFS_TX_BUFSIZE; + len += (remainder > 0 ? 
remainder : UNIFYFS_TX_BUFSIZE); } else { len = n_chunks * UNIFYFS_TX_BUFSIZE; } - LOGDBG("parallel transfer (%d/%d): offset=%lu, length=%lu", - client_rank, global_rank_cnt, - (unsigned long) offset, (unsigned long) len); + if (len > 0) { + LOGDBG("parallel transfer (%d/%d): " + "nchunks=%lu, offset=%lu, length=%lu", + client_rank, global_rank_cnt, + n_chunks, (unsigned long) offset, (unsigned long) len); - ret = do_transfer_data(fd_src, fd_dst, offset, len); + fd_dst = open(dst, O_WRONLY); + if (fd_dst < 0) { + LOGERR("failed to open file %s", dst); + ret = errno; + goto out_close_src; + } + + ret = do_transfer_data(fd_src, fd_dst, offset, len); + if (ret) { + LOGERR("failed to transfer data (ret=%d, %s)", ret, strerror(ret)); + } else { + fsync(fd_dst); + } + + close(fd_dst); + } - close(fd_dst); out_close_src: close(fd_src); @@ -2919,6 +2956,7 @@ int unifyfs_transfer_file(const char* src, const char* dst, int parallel) int ret = 0; int dir = 0; struct stat sb_src = { 0, }; + mode_t source_file_mode_write_removed; struct stat sb_dst = { 0, }; int unify_src = 0; int unify_dst = 0; @@ -2926,11 +2964,14 @@ int unifyfs_transfer_file(const char* src, const char* dst, int parallel) char* pos = dst_path; char* src_path = strdup(src); + int local_return_val; + if (!src_path) { return -ENOMEM; } - if (unifyfs_intercept_path(src)) { + char src_upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(src, src_upath)) { dir = UNIFYFS_TX_STAGE_OUT; unify_src = 1; } @@ -2942,7 +2983,8 @@ int unifyfs_transfer_file(const char* src, const char* dst, int parallel) pos += sprintf(pos, "%s", dst); - if (unifyfs_intercept_path(dst)) { + char dst_upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(dst, dst_upath)) { dir = UNIFYFS_TX_STAGE_IN; unify_dst = 1; } @@ -2957,13 +2999,31 @@ int unifyfs_transfer_file(const char* src, const char* dst, int parallel) } if (unify_src + unify_dst != 1) { - return -EINVAL; + // we may fail the operation with EINVAL, but useful for testing + LOGDBG("WARNING: none of pathnames points to unifyfs volume"); } if (parallel) { - return do_transfer_file_parallel(src_path, dst_path, &sb_src, dir); + local_return_val = + do_transfer_file_parallel(src_path, dst_path, &sb_src, dir); } else { - return do_transfer_file_serial(src_path, dst_path, &sb_src, dir); - } + local_return_val = + do_transfer_file_serial(src_path, dst_path, &sb_src, dir); + } + + // We know here that one (but not both) of the constituent files + // is in the unify FS. We just have to decide if the *destination* file is. + // If it is, then now that we've transferred it, we'll set it to be readable + // so that it will be laminated and will be readable by other processes. + if (unify_dst) { + // pull the source file's mode bits, remove all the write bits but leave + // the rest intact and store that new mode. Now that the file has been + // copied into the unify file system, chmod the file to the new + // permission. When unify senses all the write bits are removed it will + // laminate the file. + source_file_mode_write_removed = + (sb_src.st_mode) & ~(0222); + chmod(dst_path, source_file_mode_write_removed); + } + return local_return_val; } - diff --git a/client/src/unifyfs.h b/client/src/unifyfs.h index a2a11b8d1..045943891 100644 --- a/client/src/unifyfs.h +++ b/client/src/unifyfs.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. 
* - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -12,68 +12,19 @@ * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. */ -/* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * Copyright (c) 2017, Florida State University. Contributions from - * the Computer Architecture and Systems Research Laboratory (CASTL) - * at the Department of Computer Science. - * - * Written by: Teng Wang, Adam Moody, Weikuan Yu, Kento Sato, Kathryn Mohror - * LLNL-CODE-728877. All rights reserved. - * - * This file is part of burstfs. - * For details, see https://github.com/llnl/burstfs - * Please read https://github.com/llnl/burstfs/LICENSE for full license text. - */ - -/* - * Copyright (c) 2013, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * code Written by - * Raghunath Rajachandrasekar - * Kathryn Mohror - * Adam Moody - * All rights reserved. - * This file is part of CRUISE. - * For details, see https://github.com/hpc/cruise - * Please also read this file LICENSE.CRUISE - */ - #ifndef UNIFYFS_H #define UNIFYFS_H -#include -#include // size_t -#include // off_t - -#include "unifyfs_const.h" +#include #ifdef __cplusplus extern "C" { #endif -/* linked list of chunk information given to an external library wanting - * to RDMA out a file from UNIFYFS */ -typedef struct { - off_t chunk_id; - int location; - void* chunk_mr; - off_t spillover_offset; - struct chunk_list_t* next; -} chunk_list_t; - -/*data structures defined for unifyfs********************/ - -typedef struct { - char hostname[UNIFYFS_MAX_HOSTNAME]; - int rank; -} name_rank_pair_t; int unifyfs_mount(const char prefix[], int rank, size_t size, int l_app_id); int unifyfs_unmount(void); -int compare_fattr(const void* a, const void* b); /** * @brief transfer a single file between unifyfs and other file system. either diff --git a/client/src/unifyfsf.c b/client/src/unifyfsf.c index ffc39d8f5..07e405ff7 100644 --- a/client/src/unifyfsf.c +++ b/client/src/unifyfsf.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017-2019, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -94,6 +94,11 @@ static int unifyfs_fstr2cstr(const char* fstr, int flen, char* cstr, int clen) return rc; } +/* + * Marking unifyfs_cstr2fstr() with an '#if 0' block, since this function + * isn't used yet. + */ +#if 0 /* convert a C string to a Fortran string, adding trailing spaces * as necessary */ static int unifyfs_cstr2fstr(const char* cstr, char* fstr, int flen) @@ -128,6 +133,7 @@ static int unifyfs_cstr2fstr(const char* cstr, char* fstr, int flen) return rc; } +#endif /*================================================ * Mount, Unmount diff --git a/client/src/unifyfsf.h b/client/src/unifyfsf.h index 645fb2ce7..364cf437f 100644 --- a/client/src/unifyfsf.h +++ b/client/src/unifyfsf.h @@ -1,7 +1,7 @@ -! Copyright (c) 2017, Lawrence Livermore National Security, LLC. +! Copyright (c) 2020, Lawrence Livermore National Security, LLC. ! Produced at the Lawrence Livermore National Laboratory. ! -! Copyright 2017-2019, UT-Battelle, LLC. +! Copyright 2020, UT-Battelle, LLC. ! ! LLNL-CODE-741539 ! All rights reserved. 
diff --git a/common/src/LICENSE.tree_dot_h b/common/src/LICENSE.tree_dot_h new file mode 100644 index 000000000..ae4b4fe2f --- /dev/null +++ b/common/src/LICENSE.tree_dot_h @@ -0,0 +1,22 @@ +Copyright 2002 Niels Provos +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/common/src/Makefile.am b/common/src/Makefile.am index 00dc6ae0a..7d2588224 100644 --- a/common/src/Makefile.am +++ b/common/src/Makefile.am @@ -1,24 +1,25 @@ lib_LTLIBRARIES = libunifyfs_common.la -include_HEADERS = unifyfs_const.h err_enumerator.h +include_HEADERS = unifyfs_const.h unifyfs_rc.h libunifyfs_commondir = $(includedir) BASE_SRCS = \ - ini.h \ - ini.c \ - err_enumerator.h \ - err_enumerator.c \ + arraylist.h \ + arraylist.c \ cm_enumerator.h \ cm_enumerator.c \ + ini.h \ + ini.c \ rm_enumerator.h \ rm_enumerator.c \ - flatbuffers_common_builder.h \ - flatbuffers_common_reader.h \ - ucr_read_builder.h \ - ucr_read_reader.h \ + seg_tree.h \ + seg_tree.c \ + slotmap.h \ + slotmap.c \ tinyexpr.h \ tinyexpr.c \ + tree.h \ unifyfs_const.h \ unifyfs_configurator.h \ unifyfs_configurator.c \ @@ -26,15 +27,23 @@ BASE_SRCS = \ unifyfs_keyval.c \ unifyfs_log.h \ unifyfs_log.c \ + unifyfs_logio.h \ + unifyfs_logio.c \ unifyfs_meta.h \ + unifyfs_meta.c \ + unifyfs_misc.c \ + unifyfs_misc.h \ unifyfs_rpc_util.h \ unifyfs_rpc_util.c \ + unifyfs_rpc_types.h \ unifyfs_client_rpcs.h \ unifyfs_server_rpcs.h \ - unifyfs_runstate.h \ - unifyfs_runstate.c \ + unifyfs_rc.h \ + unifyfs_rc.c \ unifyfs_shm.h \ - unifyfs_shm.c + unifyfs_shm.c \ + unifyfs-stack.h \ + unifyfs-stack.c OPT_FLAGS = OPT_LIBS = @@ -56,13 +65,14 @@ libunifyfs_common_la_CPPFLAGS = \ $(OPT_FLAGS) \ $(MERCURY_CFLAGS) \ $(ARGOBOTS_CFLAGS) \ - $(MARGO_CFLAGS) \ - $(FLATCC_CFLAGS) + $(MARGO_CFLAGS) + +libunifyfs_common_la_CPPFLAGS += -DSYSCONFDIR="$(sysconfdir)" libunifyfs_common_la_LDFLAGS = \ -version-info $(LIBUNIFYFS_LT_VERSION) libunifyfs_common_la_LIBADD = \ - $(OPT_LIBS) -lm -lrt + $(OPT_LIBS) -lm -lrt -lcrypto -lpthread -AM_CFLAGS = -Wall -Wno-strict-aliasing +AM_CFLAGS = -Wall -Werror -Wno-strict-aliasing diff --git a/server/src/arraylist.c b/common/src/arraylist.c similarity index 97% rename from server/src/arraylist.c rename to common/src/arraylist.c index 68b057676..f1808d879 100644 --- a/server/src/arraylist.c +++ b/common/src/arraylist.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence 
Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. diff --git a/server/src/arraylist.h b/common/src/arraylist.h similarity index 93% rename from server/src/arraylist.h rename to common/src/arraylist.h index b33737614..8bd3d05ad 100644 --- a/server/src/arraylist.h +++ b/common/src/arraylist.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. diff --git a/common/src/cm_enumerator.c b/common/src/cm_enumerator.c index 23e46c82e..6cb9c9f3c 100644 --- a/common/src/cm_enumerator.c +++ b/common/src/cm_enumerator.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. diff --git a/common/src/cm_enumerator.h b/common/src/cm_enumerator.h index 50b611b73..2f1744cfa 100644 --- a/common/src/cm_enumerator.h +++ b/common/src/cm_enumerator.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. diff --git a/common/src/err_enumerator.h b/common/src/err_enumerator.h deleted file mode 100644 index 021262ebd..000000000 --- a/common/src/err_enumerator.h +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) 2019, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * - * Copyright 2019, UT-Battelle, LLC. - * - * LLNL-CODE-741539 - * All rights reserved. - * - * This is the license for UnifyFS. - * For details, see https://github.com/LLNL/UnifyFS. - * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. - */ - -/* Copyright (c) 2018 - Michael J. 
Brim - * - * Enumerator is part of https://github.com/MichaelBrim/tedium - * - * MIT License - See LICENSE.tedium - */ - -#ifndef _UNIFYFS_ERROR_ENUMERATOR_H_ -#define _UNIFYFS_ERROR_ENUMERATOR_H_ -#include - -/** - * @brief enumerator list expanded many times with varied ENUMITEM() definitions - * - * @param item name - * @param item short description - */ -#define UNIFYFS_ERROR_ENUMERATOR \ - ENUMITEM(ACCEPT, "Failed to accept RDMA connection.") \ - ENUMITEM(ADDR, "Failed to parse IP address and port.") \ - ENUMITEM(APPCONFIG, "Failed to initialize application config.") \ - ENUMITEM(ARRAY_BOUNDS, "Array access out of bounds.") \ - ENUMITEM(BADF, "Bad file descriptor.") \ - ENUMITEM(CHANNEL, "Error creating completion channel.") \ - ENUMITEM(CONNECT, "Error in RDMA connect or disconnect.") \ - ENUMITEM(CONTEXT, "Wrong connection context.") \ - ENUMITEM(CQ, "Error creating or polling completion queue.") \ - ENUMITEM(DBG, "Failed to open/close debug file.") \ - ENUMITEM(EVENT_UNKNOWN, "Unknown event detected.") \ - ENUMITEM(EXIST, "File or directory exists.") \ - ENUMITEM(EXIT, "Error - remote peer exited.") \ - ENUMITEM(FAILURE, "General failure.") \ - ENUMITEM(FBIG, "File too large.") \ - ENUMITEM(FILE, "File operation error.") \ - ENUMITEM(GENERAL, "General system call error.") \ - ENUMITEM(INVAL, "Invalid argument.") \ - ENUMITEM(IO, "Generic I/O error.") \ - ENUMITEM(ISDIR, "Invalid operation for directory.") \ - ENUMITEM(MARGO, "Mercury/Argobots operation error.") \ - ENUMITEM(MDHIM, "MDHIM operation error.") \ - ENUMITEM(MDINIT, "MDHIM initialization error.") \ - ENUMITEM(NAMETOOLONG, "Filename is too long.") \ - ENUMITEM(NFILE, "Too many open files.") \ - ENUMITEM(NOENT, "No such file or directory.") \ - ENUMITEM(NOENV, "Environment variable is not defined.") \ - ENUMITEM(NOMEM, "Error in memory allocation/free.") \ - ENUMITEM(NOSPC, "No space left on device.") \ - ENUMITEM(NOTDIR, "Not a directory.") \ - ENUMITEM(OVERFLOW, "Value too large for data type.") \ - ENUMITEM(PD, "Error creating PD.") \ - ENUMITEM(PIPE, "Pipe error.") \ - ENUMITEM(PMIX, "PMIx error.") \ - ENUMITEM(POLL, "Error on poll.") \ - ENUMITEM(POSTRECV, "Failed to post receive operation.") \ - ENUMITEM(POSTSEND, "Failed to post send operation.") \ - ENUMITEM(QP, "Error creating or destroying QP.") \ - ENUMITEM(READ, "Read error.") \ - ENUMITEM(RECV, "Receive error.") \ - ENUMITEM(REGMEM, "Memory [de]registration failure.") \ - ENUMITEM(RM_INIT, "Failed to init request manager.") \ - ENUMITEM(RM_RECV, "Fail to receive data in request manager.") \ - ENUMITEM(ROUTE, "Failed to resolve route.") \ - ENUMITEM(SEND, "Send error.") \ - ENUMITEM(SHMEM, "Error on shared memory attach.") \ - ENUMITEM(SOCKET, "Error creating/open socket.") \ - ENUMITEM(SOCKET_FD_EXCEED, "Exceeded max number of connections.") \ - ENUMITEM(SOCK_CMD, "Unknown exception on the remote peer.") \ - ENUMITEM(SOCK_DISCONNECT, "Remote peer disconnected.") \ - ENUMITEM(SOCK_LISTEN, "Exception on listening socket.") \ - ENUMITEM(SOCK_OTHER, "Unknown socket error.") \ - ENUMITEM(THRDINIT, "Thread initialization failure.") \ - ENUMITEM(TIMEOUT, "Error - timed out.") \ - ENUMITEM(WC, "Write completion with error.") \ - ENUMITEM(WRITE, "Write error.") \ - - -#ifdef __cplusplus -extern "C" { -#endif - -/* #define __ELASTERROR if our errno.h doesn't define it for us */ -#ifndef __ELASTERROR -#define __ELASTERROR 2000 -#endif - -/** - * @brief enum for error codes - */ -typedef enum { - UNIFYFS_INVALID_ERROR = -2, - UNIFYFS_FAILURE = -1, - UNIFYFS_SUCCESS = 0, - 
/* Start our error numbers after the standard errno.h ones */ - UNIFRFS_START_OF_ERRORS = __ELASTERROR, -#define ENUMITEM(name, desc) \ - UNIFYFS_ERROR_ ## name, - UNIFYFS_ERROR_ENUMERATOR -#undef ENUMITEM - UNIFYFS_ERROR_MAX -} unifyfs_error_e; - -/** - * @brief get C-string for given error enum value - */ -const char *unifyfs_error_enum_str(unifyfs_error_e e); - -/** - * @brief get description for given error enum value - */ -const char *unifyfs_error_enum_description(unifyfs_error_e e); - -/** - * @brief check validity of given error enum value - */ -int check_valid_unifyfs_error_enum(unifyfs_error_e e); - -/** - * @brief get enum value for given error C-string - */ -unifyfs_error_e unifyfs_error_enum_from_str(const char *s); - -#ifdef __cplusplus -} /* extern C */ -#endif - -#endif /* UNIFYFS_ERROR_ENUMERATOR_H */ diff --git a/common/src/flatbuffers_common_builder.h b/common/src/flatbuffers_common_builder.h deleted file mode 100644 index 07d840f01..000000000 --- a/common/src/flatbuffers_common_builder.h +++ /dev/null @@ -1,658 +0,0 @@ -#ifndef FLATBUFFERS_COMMON_BUILDER_H -#define FLATBUFFERS_COMMON_BUILDER_H - -/* Generated by flatcc 0.5.3-pre FlatBuffers schema compiler for C by dvide.com */ - -/* Common FlatBuffers build functionality for C. */ - -#include "flatcc/flatcc_prologue.h" -#ifndef FLATBUILDER_H -#include "flatcc/flatcc_builder.h" -#endif -typedef flatcc_builder_t flatbuffers_builder_t; -typedef flatcc_builder_ref_t flatbuffers_ref_t; -typedef flatcc_builder_ref_t flatbuffers_vec_ref_t; -typedef flatcc_builder_union_ref_t flatbuffers_union_ref_t; -typedef flatcc_builder_union_vec_ref_t flatbuffers_union_vec_ref_t; -/* integer return code (ref and ptr always fail on 0) */ -#define flatbuffers_failed(x) ((x) < 0) -typedef flatbuffers_ref_t flatbuffers_root_t; -#define flatbuffers_root(ref) ((flatbuffers_root_t)(ref)) - -#define __flatbuffers_memoize_begin(B, src)\ -do { flatcc_builder_ref_t _ref; if ((_ref = flatcc_builder_refmap_find((B), (src)))) return _ref; } while (0) -#define __flatbuffers_memoize_end(B, src, op) do { return flatcc_builder_refmap_insert((B), (src), (op)); } while (0) -#define __flatbuffers_memoize(B, src, op) do { __flatbuffers_memoize_begin(B, src); __flatbuffers_memoize_end(B, src, op); } while (0) - -#define __flatbuffers_build_buffer(NS)\ -typedef NS ## ref_t NS ## buffer_ref_t;\ -static inline int NS ## buffer_start(NS ## builder_t *B, const NS ##fid_t fid)\ -{ return flatcc_builder_start_buffer(B, fid, 0, 0); }\ -static inline int NS ## buffer_start_with_size(NS ## builder_t *B, const NS ##fid_t fid)\ -{ return flatcc_builder_start_buffer(B, fid, 0, flatcc_builder_with_size); }\ -static inline int NS ## buffer_start_aligned(NS ## builder_t *B, NS ##fid_t fid, uint16_t block_align)\ -{ return flatcc_builder_start_buffer(B, fid, block_align, 0); }\ -static inline int NS ## buffer_start_aligned_with_size(NS ## builder_t *B, NS ##fid_t fid, uint16_t block_align)\ -{ return flatcc_builder_start_buffer(B, fid, block_align, flatcc_builder_with_size); }\ -static inline NS ## buffer_ref_t NS ## buffer_end(NS ## builder_t *B, NS ## ref_t root)\ -{ return flatcc_builder_end_buffer(B, root); } - -#define __flatbuffers_build_table_root(NS, N, FID, TFID)\ -static inline int N ## _start_as_root(NS ## builder_t *B)\ -{ return NS ## buffer_start(B, FID) ? -1 : N ## _start(B); }\ -static inline int N ## _start_as_root_with_size(NS ## builder_t *B)\ -{ return NS ## buffer_start_with_size(B, FID) ? 
-1 : N ## _start(B); }\ -static inline int N ## _start_as_typed_root(NS ## builder_t *B)\ -{ return NS ## buffer_start(B, TFID) ? -1 : N ## _start(B); }\ -static inline int N ## _start_as_typed_root_with_size(NS ## builder_t *B)\ -{ return NS ## buffer_start_with_size(B, TFID) ? -1 : N ## _start(B); }\ -static inline NS ## buffer_ref_t N ## _end_as_root(NS ## builder_t *B)\ -{ return NS ## buffer_end(B, N ## _end(B)); }\ -static inline NS ## buffer_ref_t N ## _end_as_typed_root(NS ## builder_t *B)\ -{ return NS ## buffer_end(B, N ## _end(B)); }\ -static inline NS ## buffer_ref_t N ## _create_as_root(NS ## builder_t *B __ ## N ## _formal_args)\ -{ if (NS ## buffer_start(B, FID)) return 0; return NS ## buffer_end(B, N ## _create(B __ ## N ## _call_args)); }\ -static inline NS ## buffer_ref_t N ## _create_as_root_with_size(NS ## builder_t *B __ ## N ## _formal_args)\ -{ if (NS ## buffer_start_with_size(B, FID)) return 0; return NS ## buffer_end(B, N ## _create(B __ ## N ## _call_args)); }\ -static inline NS ## buffer_ref_t N ## _create_as_typed_root(NS ## builder_t *B __ ## N ## _formal_args)\ -{ if (NS ## buffer_start(B, TFID)) return 0; return NS ## buffer_end(B, N ## _create(B __ ## N ## _call_args)); }\ -static inline NS ## buffer_ref_t N ## _create_as_typed_root_with_size(NS ## builder_t *B __ ## N ## _formal_args)\ -{ if (NS ## buffer_start_with_size(B, TFID)) return 0; return NS ## buffer_end(B, N ## _create(B __ ## N ## _call_args)); }\ -static inline NS ## buffer_ref_t N ## _clone_as_root(NS ## builder_t *B, N ## _table_t t)\ -{ if (NS ## buffer_start(B, FID)) return 0; return NS ## buffer_end(B, N ## _clone(B, t)); }\ -static inline NS ## buffer_ref_t N ## _clone_as_root_with_size(NS ## builder_t *B, N ## _table_t t)\ -{ if (NS ## buffer_start_with_size(B, FID)) return 0; return NS ## buffer_end(B, N ## _clone(B, t)); }\ -static inline NS ## buffer_ref_t N ## _clone_as_typed_root(NS ## builder_t *B, N ## _table_t t)\ -{ if (NS ## buffer_start(B, TFID)) return 0;return NS ## buffer_end(B, N ## _clone(B, t)); }\ -static inline NS ## buffer_ref_t N ## _clone_as_typed_root_with_size(NS ## builder_t *B, N ## _table_t t)\ -{ if (NS ## buffer_start_with_size(B, TFID)) return 0; return NS ## buffer_end(B, N ## _clone(B, t)); } - -#define __flatbuffers_build_table_prolog(NS, N, FID, TFID)\ -__flatbuffers_build_table_vector_ops(NS, N ## _vec, N)\ -__flatbuffers_build_table_root(NS, N, FID, TFID) - -#define __flatbuffers_build_struct_root(NS, N, A, FID, TFID)\ -static inline N ## _t *N ## _start_as_root(NS ## builder_t *B)\ -{ return NS ## buffer_start(B, FID) ? 0 : N ## _start(B); }\ -static inline N ## _t *N ## _start_as_root_with_size(NS ## builder_t *B)\ -{ return NS ## buffer_start_with_size(B, FID) ? 0 : N ## _start(B); }\ -static inline N ## _t *N ## _start_as_typed_root(NS ## builder_t *B)\ -{ return NS ## buffer_start(B, TFID) ? 0 : N ## _start(B); }\ -static inline N ## _t *N ## _start_as_typed_root_with_size(NS ## builder_t *B)\ -{ return NS ## buffer_start_with_size(B, TFID) ? 
0 : N ## _start(B); }\ -static inline NS ## buffer_ref_t N ## _end_as_root(NS ## builder_t *B)\ -{ return NS ## buffer_end(B, N ## _end(B)); }\ -static inline NS ## buffer_ref_t N ## _end_as_typed_root(NS ## builder_t *B)\ -{ return NS ## buffer_end(B, N ## _end(B)); }\ -static inline NS ## buffer_ref_t N ## _end_pe_as_root(NS ## builder_t *B)\ -{ return NS ## buffer_end(B, N ## _end_pe(B)); }\ -static inline NS ## buffer_ref_t N ## _end_pe_as_typed_root(NS ## builder_t *B)\ -{ return NS ## buffer_end(B, N ## _end_pe(B)); }\ -static inline NS ## buffer_ref_t N ## _create_as_root(NS ## builder_t *B __ ## N ## _formal_args)\ -{ return flatcc_builder_create_buffer(B, FID, 0,\ - N ## _create(B __ ## N ## _call_args), A, 0); }\ -static inline NS ## buffer_ref_t N ## _create_as_root_with_size(NS ## builder_t *B __ ## N ## _formal_args)\ -{ return flatcc_builder_create_buffer(B, FID, 0,\ - N ## _create(B __ ## N ## _call_args), A, flatcc_builder_with_size); }\ -static inline NS ## buffer_ref_t N ## _create_as_typed_root(NS ## builder_t *B __ ## N ## _formal_args)\ -{ return flatcc_builder_create_buffer(B, TFID, 0,\ - N ## _create(B __ ## N ## _call_args), A, 0); }\ -static inline NS ## buffer_ref_t N ## _create_as_typed_root_with_size(NS ## builder_t *B __ ## N ## _formal_args)\ -{ return flatcc_builder_create_buffer(B, TFID, 0,\ - N ## _create(B __ ## N ## _call_args), A, flatcc_builder_with_size); }\ -static inline NS ## buffer_ref_t N ## _clone_as_root(NS ## builder_t *B, N ## _struct_t p)\ -{ return flatcc_builder_create_buffer(B, FID, 0, N ## _clone(B, p), A, 0); }\ -static inline NS ## buffer_ref_t N ## _clone_as_root_with_size(NS ## builder_t *B, N ## _struct_t p)\ -{ return flatcc_builder_create_buffer(B, FID, 0, N ## _clone(B, p), A, flatcc_builder_with_size); }\ -static inline NS ## buffer_ref_t N ## _clone_as_typed_root(NS ## builder_t *B, N ## _struct_t p)\ -{ return flatcc_builder_create_buffer(B, TFID, 0, N ## _clone(B, p), A, 0); }\ -static inline NS ## buffer_ref_t N ## _clone_as_typed_root_with_size(NS ## builder_t *B, N ## _struct_t p)\ -{ return flatcc_builder_create_buffer(B, TFID, 0, N ## _clone(B, p), A, flatcc_builder_with_size); } - -#define __flatbuffers_build_nested_table_root(NS, N, TN, FID, TFID)\ -static inline int N ## _start_as_root(NS ## builder_t *B)\ -{ return NS ## buffer_start(B, FID) ? -1 : TN ## _start(B); }\ -static inline int N ## _start_as_typed_root(NS ## builder_t *B)\ -{ return NS ## buffer_start(B, TFID) ? -1 : TN ## _start(B); }\ -static inline int N ## _end_as_root(NS ## builder_t *B)\ -{ return N ## _add(B, NS ## buffer_end(B, TN ## _end(B))); }\ -static inline int N ## _end_as_typed_root(NS ## builder_t *B)\ -{ return N ## _add(B, NS ## buffer_end(B, TN ## _end(B))); }\ -static inline int N ## _nest(NS ## builder_t *B, void *data, size_t size, uint16_t align)\ -{ return N ## _add(B, flatcc_builder_create_vector(B, data, size, 1,\ - align ? align : 8, FLATBUFFERS_COUNT_MAX(1))); }\ -static inline int N ## _typed_nest(NS ## builder_t *B, void *data, size_t size, uint16_t align)\ -{ return N ## _add(B, flatcc_builder_create_vector(B, data, size, 1,\ - align ? 
align : 8, FLATBUFFERS_COUNT_MAX(1))); }\ -static inline int N ## _clone_as_root(NS ## builder_t *B, TN ## _table_t t)\ -{ return N ## _add(B, TN ## _clone_as_root(B, t)); }\ -static inline int N ## _clone_as_typed_root(NS ## builder_t *B, TN ## _table_t t)\ -{ return N ## _add(B, TN ## _clone_as_typed_root(B, t)); } - -#define __flatbuffers_build_nested_struct_root(NS, N, TN, A, FID, TFID)\ -static inline TN ## _t *N ## _start_as_root(NS ## builder_t *B)\ -{ return NS ## buffer_start(B, FID) ? 0 : TN ## _start(B); }\ -static inline TN ## _t *N ## _start_as_typed_root(NS ## builder_t *B)\ -{ return NS ## buffer_start(B, FID) ? 0 : TN ## _start(B); }\ -static inline int N ## _end_as_root(NS ## builder_t *B)\ -{ return N ## _add(B, NS ## buffer_end(B, TN ## _end(B))); }\ -static inline int N ## _end_as_typed_root(NS ## builder_t *B)\ -{ return N ## _add(B, NS ## buffer_end(B, TN ## _end(B))); }\ -static inline int N ## _end_pe_as_root(NS ## builder_t *B)\ -{ return N ## _add(B, NS ## buffer_end(B, TN ## _end_pe(B))); }\ -static inline int N ## _create_as_root(NS ## builder_t *B __ ## TN ## _formal_args)\ -{ return N ## _add(B, flatcc_builder_create_buffer(B, FID, 0,\ - TN ## _create(B __ ## TN ## _call_args), A, flatcc_builder_is_nested)); }\ -static inline int N ## _create_as_typed_root(NS ## builder_t *B __ ## TN ## _formal_args)\ -{ return N ## _add(B, flatcc_builder_create_buffer(B, TFID, 0,\ - TN ## _create(B __ ## TN ## _call_args), A, flatcc_builder_is_nested)); }\ -static inline int N ## _nest(NS ## builder_t *B, void *data, size_t size, uint16_t align)\ -{ return N ## _add(B, flatcc_builder_create_vector(B, data, size, 1,\ - align < A ? A : align, FLATBUFFERS_COUNT_MAX(1))); }\ -static inline int N ## _typed_nest(NS ## builder_t *B, void *data, size_t size, uint16_t align)\ -{ return N ## _add(B, flatcc_builder_create_vector(B, data, size, 1,\ - align < A ? A : align, FLATBUFFERS_COUNT_MAX(1))); }\ -static inline int N ## _clone_as_root(NS ## builder_t *B, TN ## _struct_t p)\ -{ return N ## _add(B, TN ## _clone_as_root(B, p)); }\ -static inline int N ## _clone_as_typed_root(NS ## builder_t *B, TN ## _struct_t p)\ -{ return N ## _add(B, TN ## _clone_as_typed_root(B, p)); } - -#define __flatbuffers_build_vector_ops(NS, V, N, TN, T)\ -static inline T *V ## _extend(NS ## builder_t *B, size_t len)\ -{ return (T *)flatcc_builder_extend_vector(B, len); }\ -static inline T *V ## _append(NS ## builder_t *B, const T *data, size_t len)\ -{ return (T *)flatcc_builder_append_vector(B, data, len); }\ -static inline int V ## _truncate(NS ## builder_t *B, size_t len)\ -{ return flatcc_builder_truncate_vector(B, len); }\ -static inline T *V ## _edit(NS ## builder_t *B)\ -{ return (T *)flatcc_builder_vector_edit(B); }\ -static inline size_t V ## _reserved_len(NS ## builder_t *B)\ -{ return flatcc_builder_vector_count(B); }\ -static inline T *V ## _push(NS ## builder_t *B, const T *p)\ -{ T *_p; return (_p = (T *)flatcc_builder_extend_vector(B, 1)) ? (memcpy(_p, p, TN ## __size()), _p) : 0; }\ -static inline T *V ## _push_copy(NS ## builder_t *B, const T *p)\ -{ T *_p; return (_p = (T *)flatcc_builder_extend_vector(B, 1)) ? TN ## _copy(_p, p) : 0; }\ -static inline T *V ## _push_clone(NS ## builder_t *B, const T *p)\ -{ T *_p; return (_p = (T *)flatcc_builder_extend_vector(B, 1)) ? TN ## _copy(_p, p) : 0; }\ -static inline T *V ## _push_create(NS ## builder_t *B __ ## TN ## _formal_args)\ -{ T *_p; return (_p = (T *)flatcc_builder_extend_vector(B, 1)) ? 
TN ## _assign(_p __ ## TN ## _call_args) : 0; } - -#define __flatbuffers_build_vector(NS, N, T, S, A)\ -typedef NS ## ref_t N ## _vec_ref_t;\ -static inline int N ## _vec_start(NS ## builder_t *B)\ -{ return flatcc_builder_start_vector(B, S, A, FLATBUFFERS_COUNT_MAX(S)); }\ -static inline N ## _vec_ref_t N ## _vec_end_pe(NS ## builder_t *B)\ -{ return flatcc_builder_end_vector(B); }\ -static inline N ## _vec_ref_t N ## _vec_end(NS ## builder_t *B)\ -{ if (!NS ## is_native_pe()) { size_t i, n; T *p = (T *)flatcc_builder_vector_edit(B);\ - for (i = 0, n = flatcc_builder_vector_count(B); i < n; ++i)\ - { N ## _to_pe(N ## __ptr_add(p, i)); }} return flatcc_builder_end_vector(B); }\ -static inline N ## _vec_ref_t N ## _vec_create_pe(NS ## builder_t *B, const T *data, size_t len)\ -{ return flatcc_builder_create_vector(B, data, len, S, A, FLATBUFFERS_COUNT_MAX(S)); }\ -static inline N ## _vec_ref_t N ## _vec_create(NS ## builder_t *B, const T *data, size_t len)\ -{ if (!NS ## is_native_pe()) { size_t i; T *p; int ret = flatcc_builder_start_vector(B, S, A, FLATBUFFERS_COUNT_MAX(S)); if (ret) { return ret; }\ - p = (T *)flatcc_builder_extend_vector(B, len); if (!p) return 0;\ - for (i = 0; i < len; ++i) { N ## _copy_to_pe(N ## __ptr_add(p, i), N ## __const_ptr_add(data, i)); }\ - return flatcc_builder_end_vector(B); } else return flatcc_builder_create_vector(B, data, len, S, A, FLATBUFFERS_COUNT_MAX(S)); }\ -static inline N ## _vec_ref_t N ## _vec_clone(NS ## builder_t *B, N ##_vec_t vec)\ -{ __flatbuffers_memoize(B, vec, flatcc_builder_create_vector(B, vec, N ## _vec_len(vec), S, A, FLATBUFFERS_COUNT_MAX(S))); }\ -static inline N ## _vec_ref_t N ## _vec_slice(NS ## builder_t *B, N ##_vec_t vec, size_t index, size_t len)\ -{ size_t n = N ## _vec_len(vec); if (index >= n) index = n; n -= index; if (len > n) len = n;\ - return flatcc_builder_create_vector(B, N ## __const_ptr_add(vec, index), len, S, A, FLATBUFFERS_COUNT_MAX(S)); }\ -__flatbuffers_build_vector_ops(NS, N ## _vec, N, N, T) - -#define __flatbuffers_build_union_vector_ops(NS, V, N, TN)\ -static inline TN ## _union_ref_t *V ## _extend(NS ## builder_t *B, size_t len)\ -{ return flatcc_builder_extend_union_vector(B, len); }\ -static inline TN ## _union_ref_t *V ## _append(NS ## builder_t *B, const TN ## _union_ref_t *data, size_t len)\ -{ return flatcc_builder_append_union_vector(B, data, len); }\ -static inline int V ## _truncate(NS ## builder_t *B, size_t len)\ -{ return flatcc_builder_truncate_union_vector(B, len); }\ -static inline TN ## _union_ref_t *V ## _edit(NS ## builder_t *B)\ -{ return (TN ## _union_ref_t *) flatcc_builder_union_vector_edit(B); }\ -static inline size_t V ## _reserved_len(NS ## builder_t *B)\ -{ return flatcc_builder_union_vector_count(B); }\ -static inline TN ## _union_ref_t *V ## _push(NS ## builder_t *B, const TN ## _union_ref_t ref)\ -{ return flatcc_builder_union_vector_push(B, ref); }\ -static inline TN ## _union_ref_t *V ## _push_clone(NS ## builder_t *B, TN ## _union_t u)\ -{ return TN ## _vec_push(B, TN ## _clone(B, u)); } - -#define __flatbuffers_build_union_vector(NS, N)\ -static inline int N ## _vec_start(NS ## builder_t *B)\ -{ return flatcc_builder_start_union_vector(B); }\ -static inline N ## _union_vec_ref_t N ## _vec_end(NS ## builder_t *B)\ -{ return flatcc_builder_end_union_vector(B); }\ -static inline N ## _union_vec_ref_t N ## _vec_create(NS ## builder_t *B, const N ## _union_ref_t *data, size_t len)\ -{ return flatcc_builder_create_union_vector(B, data, len); }\ 
-__flatbuffers_build_union_vector_ops(NS, N ## _vec, N, N)\ -/* Preserves DAG structure separately for type and value vector, so a type vector could be shared for many value vectors. */\ -static inline N ## _union_vec_ref_t N ## _vec_clone(NS ## builder_t *B, N ##_union_vec_t vec)\ -{ N ## _union_vec_ref_t _uvref, _ret = { 0, 0 }; NS ## union_ref_t _uref; size_t _i, _len; flatcc_builder_ref_t *_p;\ - if (vec.type == 0) return _ret;\ - _uvref.type = flatcc_builder_refmap_find(B, vec.type); _uvref.value = flatcc_builder_refmap_find(B, vec.value);\ - _len = N ## _union_vec_len(vec); if (_uvref.type == 0) {\ - _uvref.type = flatcc_builder_refmap_insert(B, vec.type, (flatcc_builder_create_type_vector(B, vec.type, _len))); }\ - if (_uvref.type == 0) return _ret; if (_uvref.value == 0) {\ - if (flatcc_builder_start_offset_vector(B)) return _ret;\ - _p = flatcc_builder_extend_offset_vector(B, _len); if (!_p) return _ret;\ - for (_i = 0; _i < _len; ++_i) { _uref = N ## _clone(B, N ## _union_vec_at(vec, _i)); _p[_i] = _uref.value; }\ - _uvref.value = flatcc_builder_refmap_insert(B, vec.value, flatcc_builder_end_offset_vector(B));\ - if (_uvref.value == 0) return _ret; } return _uvref; } - -#define __flatbuffers_build_string_vector_ops(NS, N)\ -static inline int N ## _push_start(NS ## builder_t *B)\ -{ return NS ## string_start(B); }\ -static inline NS ## string_ref_t *N ## _push_end(NS ## builder_t *B)\ -{ return NS ## string_vec_push(B, NS ## string_end(B)); }\ -static inline NS ## string_ref_t *N ## _push_create(NS ## builder_t *B, const char *s, size_t len)\ -{ return NS ## string_vec_push(B, NS ## string_create(B, s, len)); }\ -static inline NS ## string_ref_t *N ## _push_create_str(NS ## builder_t *B, const char *s)\ -{ return NS ## string_vec_push(B, NS ## string_create_str(B, s)); }\ -static inline NS ## string_ref_t *N ## _push_create_strn(NS ## builder_t *B, const char *s, size_t max_len)\ -{ return NS ## string_vec_push(B, NS ## string_create_strn(B, s, max_len)); }\ -static inline NS ## string_ref_t *N ## _push_clone(NS ## builder_t *B, NS ## string_t string)\ -{ return NS ## string_vec_push(B, NS ## string_clone(B, string)); }\ -static inline NS ## string_ref_t *N ## _push_slice(NS ## builder_t *B, NS ## string_t string, size_t index, size_t len)\ -{ return NS ## string_vec_push(B, NS ## string_slice(B, string, index, len)); } - -#define __flatbuffers_build_table_vector_ops(NS, N, TN)\ -static inline int N ## _push_start(NS ## builder_t *B)\ -{ return TN ## _start(B); }\ -static inline TN ## _ref_t *N ## _push_end(NS ## builder_t *B)\ -{ return N ## _push(B, TN ## _end(B)); }\ -static inline TN ## _ref_t *N ## _push_create(NS ## builder_t *B __ ## TN ##_formal_args)\ -{ return N ## _push(B, TN ## _create(B __ ## TN ## _call_args)); } - -#define __flatbuffers_build_offset_vector_ops(NS, V, N, TN)\ -static inline TN ## _ref_t *V ## _extend(NS ## builder_t *B, size_t len)\ -{ return flatcc_builder_extend_offset_vector(B, len); }\ -static inline TN ## _ref_t *V ## _append(NS ## builder_t *B, const TN ## _ref_t *data, size_t len)\ -{ return flatcc_builder_append_offset_vector(B, data, len); }\ -static inline int V ## _truncate(NS ## builder_t *B, size_t len)\ -{ return flatcc_builder_truncate_offset_vector(B, len); }\ -static inline TN ## _ref_t *V ## _edit(NS ## builder_t *B)\ -{ return (TN ## _ref_t *)flatcc_builder_offset_vector_edit(B); }\ -static inline size_t V ## _reserved_len(NS ## builder_t *B)\ -{ return flatcc_builder_offset_vector_count(B); }\ -static inline TN ## _ref_t *V ## 
_push(NS ## builder_t *B, const TN ## _ref_t ref)\ -{ return ref ? flatcc_builder_offset_vector_push(B, ref) : 0; } - -#define __flatbuffers_build_offset_vector(NS, N)\ -typedef NS ## ref_t N ## _vec_ref_t;\ -static inline int N ## _vec_start(NS ## builder_t *B)\ -{ return flatcc_builder_start_offset_vector(B); }\ -static inline N ## _vec_ref_t N ## _vec_end(NS ## builder_t *B)\ -{ return flatcc_builder_end_offset_vector(B); }\ -static inline N ## _vec_ref_t N ## _vec_create(NS ## builder_t *B, const N ## _ref_t *data, size_t len)\ -{ return flatcc_builder_create_offset_vector(B, data, len); }\ -__flatbuffers_build_offset_vector_ops(NS, N ## _vec, N, N)\ -static inline N ## _vec_ref_t N ## _vec_clone(NS ## builder_t *B, N ##_vec_t vec)\ -{ int _ret; N ## _ref_t *_p; size_t _i, _len; __flatbuffers_memoize_begin(B, vec);\ - _len = N ## _vec_len(vec); if (flatcc_builder_start_offset_vector(B)) return 0;\ - _p = flatcc_builder_extend_offset_vector(B, _len); if (!_p) return 0;\ - for (_i = 0; _i < _len; ++_i) { if (!(_p[_i] = N ## _clone(B, N ## _vec_at(vec, _i)))) return 0; }\ - __flatbuffers_memoize_end(B, vec, flatcc_builder_end_offset_vector(B)); }\ - -#define __flatbuffers_build_string_ops(NS, N)\ -static inline char *N ## _append(NS ## builder_t *B, const char *s, size_t len)\ -{ return flatcc_builder_append_string(B, s, len); }\ -static inline char *N ## _append_str(NS ## builder_t *B, const char *s)\ -{ return flatcc_builder_append_string_str(B, s); }\ -static inline char *N ## _append_strn(NS ## builder_t *B, const char *s, size_t len)\ -{ return flatcc_builder_append_string_strn(B, s, len); }\ -static inline size_t N ## _reserved_len(NS ## builder_t *B)\ -{ return flatcc_builder_string_len(B); }\ -static inline char *N ## _extend(NS ## builder_t *B, size_t len)\ -{ return flatcc_builder_extend_string(B, len); }\ -static inline char *N ## _edit(NS ## builder_t *B)\ -{ return flatcc_builder_string_edit(B); }\ -static inline int N ## _truncate(NS ## builder_t *B, size_t len)\ -{ return flatcc_builder_truncate_string(B, len); } - -#define __flatbuffers_build_string(NS)\ -typedef NS ## ref_t NS ## string_ref_t;\ -static inline int NS ## string_start(NS ## builder_t *B)\ -{ return flatcc_builder_start_string(B); }\ -static inline NS ## string_ref_t NS ## string_end(NS ## builder_t *B)\ -{ return flatcc_builder_end_string(B); }\ -static inline NS ## ref_t NS ## string_create(NS ## builder_t *B, const char *s, size_t len)\ -{ return flatcc_builder_create_string(B, s, len); }\ -static inline NS ## ref_t NS ## string_create_str(NS ## builder_t *B, const char *s)\ -{ return flatcc_builder_create_string_str(B, s); }\ -static inline NS ## ref_t NS ## string_create_strn(NS ## builder_t *B, const char *s, size_t len)\ -{ return flatcc_builder_create_string_strn(B, s, len); }\ -static inline NS ## string_ref_t NS ## string_clone(NS ## builder_t *B, NS ## string_t string)\ -{ __flatbuffers_memoize(B, string, flatcc_builder_create_string(B, string, NS ## string_len(string))); }\ -static inline NS ## string_ref_t NS ## string_slice(NS ## builder_t *B, NS ## string_t string, size_t index, size_t len)\ -{ size_t n = NS ## string_len(string); if (index >= n) index = n; n -= index; if (len > n) len = n;\ - return flatcc_builder_create_string(B, string + index, len); }\ -__flatbuffers_build_string_ops(NS, NS ## string)\ -__flatbuffers_build_offset_vector(NS, NS ## string) - -#define __flatbuffers_copy_from_pe(P, P2, N) (*(P) = N ## _cast_from_pe(*P2), (P)) -#define __flatbuffers_from_pe(P, N) (*(P) = N ## 
_cast_from_pe(*P), (P)) -#define __flatbuffers_copy_to_pe(P, P2, N) (*(P) = N ## _cast_to_pe(*P2), (P)) -#define __flatbuffers_to_pe(P, N) (*(P) = N ## _cast_to_pe(*P), (P)) -#define __flatbuffers_define_scalar_primitives(NS, N, T)\ -static inline T *N ## _from_pe(T *p) { return __ ## NS ## from_pe(p, N); }\ -static inline T *N ## _to_pe(T *p) { return __ ## NS ## to_pe(p, N); }\ -static inline T *N ## _copy(T *p, const T *p2) { *p = *p2; return p; }\ -static inline T *N ## _copy_from_pe(T *p, const T *p2)\ -{ return __ ## NS ## copy_from_pe(p, p2, N); }\ -static inline T *N ## _copy_to_pe(T *p, const T *p2) \ -{ return __ ## NS ## copy_to_pe(p, p2, N); }\ -static inline T *N ## _assign(T *p, const T v0) { *p = v0; return p; }\ -static inline T *N ## _assign_from_pe(T *p, T v0)\ -{ *p = N ## _cast_from_pe(v0); return p; }\ -static inline T *N ## _assign_to_pe(T *p, T v0)\ -{ *p = N ## _cast_to_pe(v0); return p; } -#define __flatbuffers_build_scalar(NS, N, T)\ -__ ## NS ## define_scalar_primitives(NS, N, T)\ -__ ## NS ## build_vector(NS, N, T, sizeof(T), sizeof(T)) -/* Depends on generated copy_to/from_pe functions, and the type. */ -#define __flatbuffers_define_struct_primitives(NS, N)\ -static inline N ## _t *N ##_to_pe(N ## _t *p)\ -{ if (!NS ## is_native_pe()) { N ## _copy_to_pe(p, p); }; return p; }\ -static inline N ## _t *N ##_from_pe(N ## _t *p)\ -{ if (!NS ## is_native_pe()) { N ## _copy_from_pe(p, p); }; return p; }\ -static inline N ## _t *N ## _clear(N ## _t *p) { return (N ## _t *)memset(p, 0, N ## __size()); } - -/* Depends on generated copy/assign_to/from_pe functions, and the type. */ -#define __flatbuffers_build_struct(NS, N, S, A, FID, TFID)\ -__ ## NS ## define_struct_primitives(NS, N)\ -typedef NS ## ref_t N ## _ref_t;\ -static inline N ## _t *N ## _start(NS ## builder_t *B)\ -{ return (N ## _t *)flatcc_builder_start_struct(B, S, A); }\ -static inline N ## _ref_t N ## _end(NS ## builder_t *B)\ -{ if (!NS ## is_native_pe()) { N ## _to_pe((N ## _t *)flatcc_builder_struct_edit(B)); }\ - return flatcc_builder_end_struct(B); }\ -static inline N ## _ref_t N ## _end_pe(NS ## builder_t *B)\ -{ return flatcc_builder_end_struct(B); }\ -static inline N ## _ref_t N ## _create(NS ## builder_t *B __ ## N ## _formal_args)\ -{ N ## _t *_p = N ## _start(B); if (!_p) return 0; N ##_assign_to_pe(_p __ ## N ## _call_args);\ - return N ## _end_pe(B); }\ -static inline N ## _ref_t N ## _clone(NS ## builder_t *B, N ## _struct_t p)\ -{ N ## _t *_p; __flatbuffers_memoize_begin(B, p); _p = N ## _start(B); if (!_p) return 0;\ - N ## _copy(_p, p); __flatbuffers_memoize_end(B, p, N ##_end_pe(B)); }\ -__flatbuffers_build_vector(NS, N, N ## _t, S, A)\ -__flatbuffers_build_struct_root(NS, N, A, FID, TFID) - -#define __flatbuffers_build_table(NS, N, K)\ -static inline int N ## _start(NS ## builder_t *B)\ -{ return flatcc_builder_start_table(B, K); }\ -static inline N ## _ref_t N ## _end(NS ## builder_t *B)\ -{ assert(flatcc_builder_check_required(B, __ ## N ## _required,\ - sizeof(__ ## N ## _required) / sizeof(__ ## N ## _required[0]) - 1));\ - return flatcc_builder_end_table(B); }\ -__flatbuffers_build_offset_vector(NS, N) - -#define __flatbuffers_build_table_field(ID, NS, N, TN, TT)\ -static inline int N ## _add(NS ## builder_t *B, TN ## _ref_t ref)\ -{ TN ## _ref_t *_p; return (ref && (_p = flatcc_builder_table_add_offset(B, ID))) ?\ - ((*_p = ref), 0) : -1; }\ -static inline int N ## _start(NS ## builder_t *B)\ -{ return TN ## _start(B); }\ -static inline int N ## _end(NS ## builder_t *B)\ -{ 
return N ## _add(B, TN ## _end(B)); }\ -static inline TN ## _ref_t N ## _create(NS ## builder_t *B __ ## TN ##_formal_args)\ -{ return N ## _add(B, TN ## _create(B __ ## TN ## _call_args)); }\ -static inline int N ## _clone(NS ## builder_t *B, TN ## _table_t p)\ -{ return N ## _add(B, TN ## _clone(B, p)); }\ -static inline int N ## _pick(NS ## builder_t *B, TT ## _table_t t)\ -{ TN ## _table_t _p = N ## _get(t); return _p ? N ## _clone(B, _p) : 0; } - -#define __flatbuffers_build_union_field(ID, NS, N, TN, TT)\ -static inline int N ## _add(NS ## builder_t *B, TN ## _union_ref_t uref)\ -{ NS ## ref_t *_p; TN ## _union_type_t *_pt; if (uref.type == TN ## _NONE) return 0; if (uref.value == 0) return -1;\ - if (!(_pt = (TN ## _union_type_t *)flatcc_builder_table_add(B, ID - 1, sizeof(*_pt), sizeof(*_pt))) ||\ - !(_p = flatcc_builder_table_add_offset(B, ID))) return -1; *_pt = uref.type; *_p = uref.value; return 0; }\ -static inline int N ## _add_type(NS ## builder_t *B, TN ## _union_type_t type)\ -{ TN ## _union_type_t *_pt; if (type == TN ## _NONE) return 0; return (_pt = (TN ## _union_type_t *)flatcc_builder_table_add(B, ID - 1,\ - sizeof(*_pt), sizeof(*_pt))) ? ((*_pt = type), 0) : -1; }\ -static inline int N ## _add_value(NS ## builder_t *B, TN ## _union_ref_t uref)\ -{ NS ## ref_t *p; if (uref.type == TN ## _NONE) return 0; return (p = flatcc_builder_table_add_offset(B, ID)) ?\ - ((*p = uref.value), 0) : -1; }\ -static inline int N ## _clone(NS ## builder_t *B, TN ## _union_t p)\ -{ return N ## _add(B, TN ## _clone(B, p)); }\ -static inline int N ## _pick(NS ## builder_t *B, TT ## _table_t t)\ -{ TN ## _union_t _p = N ## _union(t); return _p.type ? N ## _clone(B, _p) : 0; } - -/* M is the union value name and T is its type, i.e. the qualified name. */ -#define __flatbuffers_build_union_table_value_field(NS, N, NU, M, T)\ -static inline int N ## _ ## M ## _add(NS ## builder_t *B, T ## _ref_t ref)\ -{ return N ## _add(B, NU ## _as_ ## M (ref)); }\ -static inline int N ## _ ## M ## _start(NS ## builder_t *B)\ -{ return T ## _start(B); }\ -static inline int N ## _ ## M ## _end(NS ## builder_t *B)\ -{ T ## _ref_t ref = T ## _end(B);\ - return ref ? N ## _ ## M ## _add(B, ref) : -1; }\ -static inline int N ## _ ## M ## _create(NS ## builder_t *B __ ## T ##_formal_args)\ -{ T ## _ref_t ref = T ## _create(B __ ## T ## _call_args);\ - return ref ? N ## _add(B, NU ## _as_ ## M(ref)) : -1; }\ -static inline int N ## _ ## M ## _clone(NS ## builder_t *B, T ## _table_t t)\ -{ T ## _ref_t ref = T ## _clone(B, t);\ - return ref ? N ## _add(B, NU ## _as_ ## M(ref)) : -1; } - -/* M is the union value name and T is its type, i.e. the qualified name. */ -#define __flatbuffers_build_union_struct_value_field(NS, N, NU, M, T)\ -static inline int N ## _ ## M ## _add(NS ## builder_t *B, T ## _ref_t ref)\ -{ return N ## _add(B, NU ## _as_ ## M (ref)); }\ -static inline T ## _t *N ## _ ## M ## _start(NS ## builder_t *B)\ -{ return T ## _start(B); }\ -static inline int N ## _ ## M ## _end(NS ## builder_t *B)\ -{ T ## _ref_t ref = T ## _end(B);\ - return ref ? N ## _ ## M ## _add(B, ref) : -1; }\ -static inline int N ## _ ## M ## _create(NS ## builder_t *B __ ## T ##_formal_args)\ -{ T ## _ref_t ref = T ## _create(B __ ## T ## _call_args);\ - return ref ? N ## _add(B, NU ## _as_ ## M(ref)) : -1; }\ -static inline int N ## _ ## M ## _end_pe(NS ## builder_t *B)\ -{ T ## _ref_t ref = T ## _end_pe(B);\ - return ref ? 
N ## _add(B, NU ## _as_ ## M(ref)) : -1; }\ -static inline int N ## _ ## M ## _clone(NS ## builder_t *B, T ## _struct_t p)\ -{ T ## _ref_t ref = T ## _clone(B, p);\ - return ref ? N ## _add(B, NU ## _as_ ## M(ref)) : -1; } -#define __flatbuffers_build_union_string_value_field(NS, N, NU, M)\ -static inline int N ## _ ## M ## _add(NS ## builder_t *B, NS ## string_ref_t ref)\ -{ return N ## _add(B, NU ## _as_ ## M (ref)); }\ -__flatbuffers_build_string_field_ops(NS, N ## _ ## M) - -/* NS: common namespace, ID: table field id (not offset), TN: name of type T, TT: name of table type - * S: sizeof of scalar type, A: alignment of type T, default value V of type T. */ -#define __flatbuffers_build_scalar_field(ID, NS, N, TN, T, S, A, V, TT)\ -static inline int N ## _add(NS ## builder_t *B, const T v)\ -{ T *_p; if (v == V) return 0; if (!(_p = (T *)flatcc_builder_table_add(B, ID, S, A))) return -1;\ - TN ## _assign_to_pe(_p, v); return 0; }\ -static inline int N ## _force_add(NS ## builder_t *B, const T v)\ -{ T *_p; if (!(_p = (T *)flatcc_builder_table_add(B, ID, S, A))) return -1;\ - TN ## _assign_to_pe(_p, v); return 0; }\ -/* Clone does not skip default values and expects pe endian content. */\ -static inline int N ## _clone(NS ## builder_t *B, const T *p)\ -{ return 0 == flatcc_builder_table_add_copy(B, ID, p, S, A) ? -1 : 0; }\ -/* Transferring a missing field is a nop success with 0 as result. */\ -static inline int N ## _pick(NS ## builder_t *B, TT ## _table_t t)\ -{ const T *_p = N ## _get_ptr(t); return _p ? N ## _clone(B, _p) : 0; } - -#define __flatbuffers_build_struct_field(ID, NS, N, TN, S, A, TT)\ -static inline TN ## _t *N ## _start(NS ## builder_t *B)\ -{ return (TN ## _t *)flatcc_builder_table_add(B, ID, S, A); }\ -static inline int N ## _end(NS ## builder_t *B)\ -{ if (!NS ## is_native_pe()) { TN ## _to_pe((TN ## _t *)flatcc_builder_table_edit(B, S)); } return 0; }\ -static inline int N ## _end_pe(NS ## builder_t *B) { return 0; }\ -static inline int N ## _create(NS ## builder_t *B __ ## TN ## _formal_args)\ -{ TN ## _t *_p = N ## _start(B); if (!_p) return 0; TN ##_assign_to_pe(_p __ ## TN ## _call_args);\ - return 0; }\ -static inline int N ## _add(NS ## builder_t *B, const TN ## _t *p)\ -{ TN ## _t *_p = N ## _start(B); if (!_p) return -1; TN ##_copy_to_pe(_p, p); return 0; }\ -static inline int N ## _clone(NS ## builder_t *B, TN ## _struct_t p)\ -{ return 0 == flatcc_builder_table_add_copy(B, ID, p, S, A) ? -1 : 0; }\ -static inline int N ## _pick(NS ## builder_t *B, TT ## _table_t t)\ -{ TN ## _struct_t _p = N ## _get(t); return _p ? N ## _clone(B, _p) : 0; } - -#define __flatbuffers_build_vector_field(ID, NS, N, TN, T, TT)\ -static inline int N ## _add(NS ## builder_t *B, TN ## _vec_ref_t ref)\ -{ TN ## _vec_ref_t *_p; return (ref && (_p = flatcc_builder_table_add_offset(B, ID))) ? 
((*_p = ref), 0) : -1; }\ -static inline int N ## _start(NS ## builder_t *B)\ -{ return TN ## _vec_start(B); }\ -static inline int N ## _end_pe(NS ## builder_t *B)\ -{ return N ## _add(B, TN ## _vec_end_pe(B)); }\ -static inline int N ## _end(NS ## builder_t *B)\ -{ return N ## _add(B, TN ## _vec_end(B)); }\ -static inline int N ## _create_pe(NS ## builder_t *B, T *data, size_t len)\ -{ return N ## _add(B, TN ## _vec_create_pe(B, data, len)); }\ -static inline int N ## _create(NS ## builder_t *B, T *data, size_t len)\ -{ return N ## _add(B, TN ## _vec_create(B, data, len)); }\ -static inline int N ## _slice(NS ## builder_t *B, TN ## _vec_t vec, size_t index, size_t len)\ -{ return N ## _add(B, TN ## _vec_slice(B, vec, index, len)); }\ -static inline int N ## _clone(NS ## builder_t *B, TN ## _vec_t vec)\ -{ return N ## _add(B, TN ## _vec_clone(B, vec)); }\ -static inline int N ## _pick(NS ## builder_t *B, TT ## _table_t t)\ -{ TN ## _vec_t _p = N ## _get(t); return _p ? N ## _clone(B, _p) : 0; }\ -__flatbuffers_build_vector_ops(NS, N, N, TN, T)\ - -#define __flatbuffers_build_offset_vector_field(ID, NS, N, TN, TT)\ -static inline int N ## _add(NS ## builder_t *B, TN ## _vec_ref_t ref)\ -{ TN ## _vec_ref_t *_p; return (ref && (_p = flatcc_builder_table_add_offset(B, ID))) ? ((*_p = ref), 0) : -1; }\ -static inline int N ## _start(NS ## builder_t *B)\ -{ return flatcc_builder_start_offset_vector(B); }\ -static inline int N ## _end(NS ## builder_t *B)\ -{ return N ## _add(B, flatcc_builder_end_offset_vector(B)); }\ -static inline int N ## _create(NS ## builder_t *B, const TN ## _ref_t *data, size_t len)\ -{ return N ## _add(B, flatcc_builder_create_offset_vector(B, data, len)); }\ -__flatbuffers_build_offset_vector_ops(NS, N, N, TN)\ -static inline int N ## _clone(NS ## builder_t *B, TN ## _vec_t vec)\ -{ return N ## _add(B, TN ## _vec_clone(B, vec)); }\ -static inline int N ## _pick(NS ## builder_t *B, TT ## _table_t t)\ -{ TN ## _vec_t _p = N ## _get(t); return _p ? N ## _clone(B, _p) : 0; } - -/* depends on N ## _add which differs for union member fields and ordinary fields */\ -#define __flatbuffers_build_string_field_ops(NS, N)\ -static inline int N ## _start(NS ## builder_t *B)\ -{ return flatcc_builder_start_string(B); }\ -static inline int N ## _end(NS ## builder_t *B)\ -{ return N ## _add(B, flatcc_builder_end_string(B)); }\ -static inline int N ## _create(NS ## builder_t *B, const char *s, size_t len)\ -{ return N ## _add(B, flatcc_builder_create_string(B, s, len)); }\ -static inline int N ## _create_str(NS ## builder_t *B, const char *s)\ -{ return N ## _add(B, flatcc_builder_create_string_str(B, s)); }\ -static inline int N ## _create_strn(NS ## builder_t *B, const char *s, size_t max_len)\ -{ return N ## _add(B, flatcc_builder_create_string_strn(B, s, max_len)); }\ -static inline int N ## _clone(NS ## builder_t *B, NS ## string_t string)\ -{ return N ## _add(B, NS ## string_clone(B, string)); }\ -static inline int N ## _slice(NS ## builder_t *B, NS ## string_t string, size_t index, size_t len)\ -{ return N ## _add(B, NS ## string_slice(B, string, index, len)); }\ -__flatbuffers_build_string_ops(NS, N) - -#define __flatbuffers_build_string_field(ID, NS, N, TT)\ -static inline int N ## _add(NS ## builder_t *B, NS ## string_ref_t ref)\ -{ NS ## string_ref_t *_p; return (ref && (_p = flatcc_builder_table_add_offset(B, ID))) ? 
((*_p = ref), 0) : -1; }\ -__flatbuffers_build_string_field_ops(NS, N)\ -static inline int N ## _pick(NS ## builder_t *B, TT ## _table_t t)\ -{ NS ## string_t _p = N ## _get(t); return _p ? N ## _clone(B, _p) : 0; } - -#define __flatbuffers_build_table_vector_field(ID, NS, N, TN, TT)\ -__flatbuffers_build_offset_vector_field(ID, NS, N, TN, TT)\ -__flatbuffers_build_table_vector_ops(NS, N, TN) - -#define __flatbuffers_build_union_vector_field(ID, NS, N, TN, TT)\ -static inline int N ## _add(NS ## builder_t *B, TN ## _union_vec_ref_t uvref)\ -{ NS ## vec_ref_t *_p; if (!uvref.type || !uvref.value) return uvref.type == uvref.value ? 0 : -1;\ - if (!(_p = flatcc_builder_table_add_offset(B, ID - 1))) return -1; *_p = uvref.type;\ - if (!(_p = flatcc_builder_table_add_offset(B, ID))) return -1; *_p = uvref.value; return 0; }\ -static inline int N ## _start(NS ## builder_t *B)\ -{ return flatcc_builder_start_union_vector(B); }\ -static inline int N ## _end(NS ## builder_t *B)\ -{ return N ## _add(B, flatcc_builder_end_union_vector(B)); }\ -static inline int N ## _create(NS ## builder_t *B, const TN ## _union_ref_t *data, size_t len)\ -{ return N ## _add(B, flatcc_builder_create_union_vector(B, data, len)); }\ -__flatbuffers_build_union_vector_ops(NS, N, N, TN)\ -static inline int N ## _clone(NS ## builder_t *B, TN ## _union_vec_t vec)\ -{ return N ## _add(B, TN ## _vec_clone(B, vec)); }\ -static inline int N ## _pick(NS ## builder_t *B, TT ## _table_t t)\ -{ TN ## _union_vec_t _p = N ## _union(t); return _p.type ? N ## _clone(B, _p) : 0; } - -#define __flatbuffers_build_union_table_vector_value_field(NS, N, NU, M, T)\ -static inline int N ## _ ## M ## _push_start(NS ## builder_t *B)\ -{ return T ## _start(B); }\ -static inline NU ## _union_ref_t *N ## _ ## M ## _push_end(NS ## builder_t *B)\ -{ return NU ## _vec_push(B, NU ## _as_ ## M (T ## _end(B))); }\ -static inline NU ## _union_ref_t *N ## _ ## M ## _push(NS ## builder_t *B, T ## _ref_t ref)\ -{ return NU ## _vec_push(B, NU ## _as_ ## M (ref)); }\ -static inline NU ## _union_ref_t *N ## _ ## M ## _push_create(NS ## builder_t *B __ ## T ##_formal_args)\ -{ return NU ## _vec_push(B, NU ## _as_ ## M(T ## _create(B __ ## T ## _call_args))); }\ -static inline NU ## _union_ref_t *N ## _ ## M ## _push_clone(NS ## builder_t *B, T ## _table_t t)\ -{ return NU ## _vec_push(B, NU ## _as_ ## M(T ## _clone(B, t))); } - -#define __flatbuffers_build_union_struct_vector_value_field(NS, N, NU, M, T)\ -static inline T ## _t *N ## _ ## M ## _push_start(NS ## builder_t *B)\ -{ return T ## _start(B); }\ -static inline NU ## _union_ref_t *N ## _ ## M ## _push_end(NS ## builder_t *B)\ -{ return NU ## _vec_push(B, NU ## _as_ ## M (T ## _end(B))); }\ -static inline NU ## _union_ref_t *N ## _ ## M ## _push(NS ## builder_t *B, T ## _ref_t ref)\ -{ return NU ## _vec_push(B, NU ## _as_ ## M (ref)); }\ -static inline NU ## _union_ref_t *N ## _ ## M ## _push_create(NS ## builder_t *B __ ## T ##_formal_args)\ -{ return NU ## _vec_push(B, NU ## _as_ ## M(T ## _create(B __ ## T ## _call_args))); }\ -static inline NU ## _union_ref_t *N ## _ ## M ## _push_clone(NS ## builder_t *B, T ## _struct_t p)\ -{ return NU ## _vec_push(B, NU ## _as_ ## M(T ## _clone(B, p))); } - -#define __flatbuffers_build_union_string_vector_value_field(NS, N, NU, M)\ -static inline NU ## _union_ref_t *N ## _ ## M ## _push(NS ## builder_t *B, NS ## string_ref_t ref)\ -{ return NU ## _vec_push(B, NU ## _as_ ## M (ref)); }\ -static inline int N ## _ ## M ## _push_start(NS ## builder_t *B)\ -{ return NS 
## string_start(B); }\ -static inline NU ## _union_ref_t *N ## _ ## M ## _push_end(NS ## builder_t *B)\ -{ return NU ## _vec_push(B, NU ## _as_ ## M(NS ## string_end(B))); }\ -static inline NU ## _union_ref_t *N ## _ ## M ## _push_create(NS ## builder_t *B, const char *s, size_t len)\ -{ return NU ## _vec_push(B, NU ## _as_ ## M(NS ## string_create(B, s, len))); }\ -static inline NU ## _union_ref_t *N ## _ ## M ## _push_create_str(NS ## builder_t *B, const char *s)\ -{ return NU ## _vec_push(B, NU ## _as_ ## M(NS ## string_create_str(B, s))); }\ -static inline NU ## _union_ref_t *N ## _ ## M ## _push_create_strn(NS ## builder_t *B, const char *s, size_t max_len)\ -{ return NU ## _vec_push(B, NU ## _as_ ## M(NS ## string_create_strn(B, s, max_len))); }\ -static inline NU ## _union_ref_t *N ## _ ## M ## _push_clone(NS ## builder_t *B, NS ## string_t string)\ -{ return NU ## _vec_push(B, NU ## _as_ ## M(NS ## string_clone(B, string))); }\ -static inline NU ## _union_ref_t *N ## _ ## M ## _push_slice(NS ## builder_t *B, NS ## string_t string, size_t index, size_t len)\ -{ return NU ## _vec_push(B, NU ## _as_ ## M(NS ## string_slice(B, string, index, len))); } - -#define __flatbuffers_build_string_vector_field(ID, NS, N, TT)\ -__flatbuffers_build_offset_vector_field(ID, NS, N, NS ## string, TT)\ -__flatbuffers_build_string_vector_ops(NS, N) - -#define __flatbuffers_uint8_formal_args , uint8_t v0 -#define __flatbuffers_uint8_call_args , v0 -#define __flatbuffers_int8_formal_args , int8_t v0 -#define __flatbuffers_int8_call_args , v0 -#define __flatbuffers_bool_formal_args , flatbuffers_bool_t v0 -#define __flatbuffers_bool_call_args , v0 -#define __flatbuffers_uint16_formal_args , uint16_t v0 -#define __flatbuffers_uint16_call_args , v0 -#define __flatbuffers_uint32_formal_args , uint32_t v0 -#define __flatbuffers_uint32_call_args , v0 -#define __flatbuffers_uint64_formal_args , uint64_t v0 -#define __flatbuffers_uint64_call_args , v0 -#define __flatbuffers_int16_formal_args , int16_t v0 -#define __flatbuffers_int16_call_args , v0 -#define __flatbuffers_int32_formal_args , int32_t v0 -#define __flatbuffers_int32_call_args , v0 -#define __flatbuffers_int64_formal_args , int64_t v0 -#define __flatbuffers_int64_call_args , v0 -#define __flatbuffers_float_formal_args , float v0 -#define __flatbuffers_float_call_args , v0 -#define __flatbuffers_double_formal_args , double v0 -#define __flatbuffers_double_call_args , v0 - -__flatbuffers_build_scalar(flatbuffers_, flatbuffers_uint8, uint8_t) -__flatbuffers_build_scalar(flatbuffers_, flatbuffers_int8, int8_t) -__flatbuffers_build_scalar(flatbuffers_, flatbuffers_bool, flatbuffers_bool_t) -__flatbuffers_build_scalar(flatbuffers_, flatbuffers_uint16, uint16_t) -__flatbuffers_build_scalar(flatbuffers_, flatbuffers_uint32, uint32_t) -__flatbuffers_build_scalar(flatbuffers_, flatbuffers_uint64, uint64_t) -__flatbuffers_build_scalar(flatbuffers_, flatbuffers_int16, int16_t) -__flatbuffers_build_scalar(flatbuffers_, flatbuffers_int32, int32_t) -__flatbuffers_build_scalar(flatbuffers_, flatbuffers_int64, int64_t) -__flatbuffers_build_scalar(flatbuffers_, flatbuffers_float, float) -__flatbuffers_build_scalar(flatbuffers_, flatbuffers_double, double) - -__flatbuffers_build_string(flatbuffers_) - -__flatbuffers_build_buffer(flatbuffers_) -#include "flatcc/flatcc_epilogue.h" -#endif /* FLATBUFFERS_COMMON_BUILDER_H */ diff --git a/common/src/flatbuffers_common_reader.h b/common/src/flatbuffers_common_reader.h deleted file mode 100644 index 529ecd8d9..000000000 --- 
a/common/src/flatbuffers_common_reader.h +++ /dev/null @@ -1,512 +0,0 @@ -#ifndef FLATBUFFERS_COMMON_READER_H -#define FLATBUFFERS_COMMON_READER_H - -/* Generated by flatcc 0.5.3-pre FlatBuffers schema compiler for C by dvide.com */ - -/* Common FlatBuffers read functionality for C. */ - -#include "flatcc/flatcc_prologue.h" -#include "flatcc/flatcc_flatbuffers.h" - - -#define __flatbuffers_read_scalar_at_byteoffset(N, p, o) N ## _read_from_pe((uint8_t *)(p) + (o)) -#define __flatbuffers_read_scalar(N, p) N ## _read_from_pe(p) -#define __flatbuffers_read_vt(ID, offset, t)\ -flatbuffers_voffset_t offset = 0;\ -{ flatbuffers_voffset_t id__tmp, *vt__tmp;\ - assert(t != 0 && "null pointer table access");\ - id__tmp = ID;\ - vt__tmp = (flatbuffers_voffset_t *)((uint8_t *)(t) -\ - __flatbuffers_soffset_read_from_pe(t));\ - if (__flatbuffers_voffset_read_from_pe(vt__tmp) >= sizeof(vt__tmp[0]) * (id__tmp + 3)) {\ - offset = __flatbuffers_voffset_read_from_pe(vt__tmp + id__tmp + 2);\ - }\ -} -#define __flatbuffers_field_present(ID, t) { __flatbuffers_read_vt(ID, offset__tmp, t) return offset__tmp != 0; } -#define __flatbuffers_scalar_field(T, ID, t)\ -{\ - __flatbuffers_read_vt(ID, offset__tmp, t)\ - if (offset__tmp) {\ - return (const T *)((uint8_t *)(t) + offset__tmp);\ - }\ - return 0;\ -} -#define __flatbuffers_define_scalar_field(ID, N, NK, TK, T, V)\ -static inline T N ## _ ## NK ## _get(N ## _table_t t__tmp)\ -{ __flatbuffers_read_vt(ID, offset__tmp, t__tmp)\ - return offset__tmp ? __flatbuffers_read_scalar_at_byteoffset(TK, t__tmp, offset__tmp) : V;\ -}\ -static inline T N ## _ ## NK(N ## _table_t t__tmp)\ -{ __flatbuffers_read_vt(ID, offset__tmp, t__tmp)\ - return offset__tmp ? __flatbuffers_read_scalar_at_byteoffset(TK, t__tmp, offset__tmp) : V;\ -}\ -static inline const T *N ## _ ## NK ## _get_ptr(N ## _table_t t__tmp)\ -__flatbuffers_scalar_field(T, ID, t__tmp)\ -static inline int N ## _ ## NK ## _is_present(N ## _table_t t__tmp)\ -__flatbuffers_field_present(ID, t__tmp)\ -__flatbuffers_define_scan_by_scalar_field(N, NK, T) -#define __flatbuffers_struct_field(T, ID, t, r)\ -{\ - __flatbuffers_read_vt(ID, offset__tmp, t)\ - if (offset__tmp) {\ - return (T)((uint8_t *)(t) + offset__tmp);\ - }\ - assert(!(r) && "required field missing");\ - return 0;\ -} -#define __flatbuffers_offset_field(T, ID, t, r, adjust)\ -{\ - flatbuffers_uoffset_t *elem__tmp;\ - __flatbuffers_read_vt(ID, offset__tmp, t)\ - if (offset__tmp) {\ - elem__tmp = (flatbuffers_uoffset_t *)((uint8_t *)(t) + offset__tmp);\ - /* Add sizeof so C api can have raw access past header field. 
*/\ - return (T)((uint8_t *)(elem__tmp) + adjust +\ - __flatbuffers_uoffset_read_from_pe(elem__tmp));\ - }\ - assert(!(r) && "required field missing");\ - return 0;\ -} -#define __flatbuffers_vector_field(T, ID, t, r) __flatbuffers_offset_field(T, ID, t, r, sizeof(flatbuffers_uoffset_t)) -#define __flatbuffers_table_field(T, ID, t, r) __flatbuffers_offset_field(T, ID, t, r, 0) -#define __flatbuffers_define_struct_field(ID, N, NK, T, r)\ -static inline T N ## _ ## NK ## _get(N ## _table_t t__tmp)\ -__flatbuffers_struct_field(T, ID, t__tmp, r)\ -static inline T N ## _ ## NK(N ## _table_t t__tmp)\ -__flatbuffers_struct_field(T, ID, t__tmp, r)\ -static inline int N ## _ ## NK ## _is_present(N ## _table_t t__tmp)\ -__flatbuffers_field_present(ID, t__tmp) -#define __flatbuffers_define_vector_field(ID, N, NK, T, r)\ -static inline T N ## _ ## NK ## _get(N ## _table_t t__tmp)\ -__flatbuffers_vector_field(T, ID, t__tmp, r)\ -static inline T N ## _ ## NK(N ## _table_t t__tmp)\ -__flatbuffers_vector_field(T, ID, t__tmp, r)\ -static inline int N ## _ ## NK ## _is_present(N ## _table_t t__tmp)\ -__flatbuffers_field_present(ID, t__tmp) -#define __flatbuffers_define_table_field(ID, N, NK, T, r)\ -static inline T N ## _ ## NK ## _get(N ## _table_t t__tmp)\ -__flatbuffers_table_field(T, ID, t__tmp, r)\ -static inline T N ## _ ## NK(N ## _table_t t__tmp)\ -__flatbuffers_table_field(T, ID, t__tmp, r)\ -static inline int N ## _ ## NK ## _is_present(N ## _table_t t__tmp)\ -__flatbuffers_field_present(ID, t__tmp) -#define __flatbuffers_define_string_field(ID, N, NK, r)\ -static inline flatbuffers_string_t N ## _ ## NK ## _get(N ## _table_t t__tmp)\ -__flatbuffers_vector_field(flatbuffers_string_t, ID, t__tmp, r)\ -static inline flatbuffers_string_t N ## _ ## NK(N ## _table_t t__tmp)\ -__flatbuffers_vector_field(flatbuffers_string_t, ID, t__tmp, r)\ -static inline int N ## _ ## NK ## _is_present(N ## _table_t t__tmp)\ -__flatbuffers_field_present(ID, t__tmp)\ -__flatbuffers_define_scan_by_string_field(N, NK) -#define __flatbuffers_vec_len(vec)\ -{ return (vec) ? (size_t)__flatbuffers_uoffset_read_from_pe((flatbuffers_uoffset_t *)vec - 1) : 0; } -#define __flatbuffers_string_len(s) __flatbuffers_vec_len(s) -static inline size_t flatbuffers_vec_len(const void *vec) -__flatbuffers_vec_len(vec) -#define __flatbuffers_scalar_vec_at(N, vec, i)\ -{ assert(flatbuffers_vec_len(vec) > (i) && "index out of range");\ - return __flatbuffers_read_scalar(N, &(vec)[i]); } -#define __flatbuffers_struct_vec_at(vec, i)\ -{ assert(flatbuffers_vec_len(vec) > (i) && "index out of range"); return (vec) + (i); } -/* `adjust` skips past the header for string vectors. 
*/ -#define __flatbuffers_offset_vec_at(T, vec, i, adjust)\ -{ const flatbuffers_uoffset_t *elem__tmp = (vec) + (i);\ - assert(flatbuffers_vec_len(vec) > (i) && "index out of range");\ - return (T)((uint8_t *)(elem__tmp) + (size_t)__flatbuffers_uoffset_read_from_pe(elem__tmp) + (adjust)); } -#define __flatbuffers_define_scalar_vec_len(N)\ -static inline size_t N ## _vec_len(N ##_vec_t vec__tmp)\ -{ return flatbuffers_vec_len(vec__tmp); } -#define __flatbuffers_define_scalar_vec_at(N, T) \ -static inline T N ## _vec_at(N ## _vec_t vec__tmp, size_t i__tmp)\ -__flatbuffers_scalar_vec_at(N, vec__tmp, i__tmp) -typedef const char *flatbuffers_string_t; -static inline size_t flatbuffers_string_len(flatbuffers_string_t s) -__flatbuffers_string_len(s) -typedef const flatbuffers_uoffset_t *flatbuffers_string_vec_t; -typedef flatbuffers_uoffset_t *flatbuffers_string_mutable_vec_t; -static inline size_t flatbuffers_string_vec_len(flatbuffers_string_vec_t vec) -__flatbuffers_vec_len(vec) -static inline flatbuffers_string_t flatbuffers_string_vec_at(flatbuffers_string_vec_t vec, size_t i) -__flatbuffers_offset_vec_at(flatbuffers_string_t, vec, i, sizeof(vec[0])) -typedef const void *flatbuffers_generic_t; -static inline flatbuffers_string_t flatbuffers_string_cast_from_generic(const flatbuffers_generic_t p) -{ return p ? ((const char *)p) + __flatbuffers_uoffset__size() : 0; } -typedef const flatbuffers_uoffset_t *flatbuffers_generic_vec_t; -typedef flatbuffers_uoffset_t *flatbuffers_generic_table_mutable_vec_t; -static inline size_t flatbuffers_generic_vec_len(flatbuffers_generic_vec_t vec) -__flatbuffers_vec_len(vec) -static inline flatbuffers_generic_t flatbuffers_generic_vec_at(flatbuffers_generic_vec_t vec, size_t i) -__flatbuffers_offset_vec_at(flatbuffers_generic_t, vec, i, 0) -static inline flatbuffers_generic_t flatbuffers_generic_vec_at_as_string(flatbuffers_generic_vec_t vec, size_t i) -__flatbuffers_offset_vec_at(flatbuffers_generic_t, vec, i, sizeof(vec[0])) -typedef struct flatbuffers_union { - flatbuffers_union_type_t type; - flatbuffers_generic_t value; -} flatbuffers_union_t; -typedef struct flatbuffers_union_vec { - const flatbuffers_union_type_t *type; - const flatbuffers_uoffset_t *value; -} flatbuffers_union_vec_t; -#define __flatbuffers_union_type_field(ID, t)\ -{\ - __flatbuffers_read_vt(ID, offset__tmp, t)\ - return offset__tmp ? 
__flatbuffers_read_scalar_at_byteoffset(__flatbuffers_utype, t, offset__tmp) : 0;\ -} -static inline flatbuffers_string_t flatbuffers_string_cast_from_union(const flatbuffers_union_t u__tmp)\ -{ return flatbuffers_string_cast_from_generic(u__tmp.value); } -#define __flatbuffers_define_union_field(NS, ID, N, NK, T, r)\ -static inline T ## _union_type_t N ## _ ## NK ## _type_get(N ## _table_t t__tmp)\ -__## NS ## union_type_field(((ID) - 1), t__tmp)\ -static inline NS ## generic_t N ## _ ## NK ## _get(N ## _table_t t__tmp)\ -__## NS ## table_field(NS ## generic_t, ID, t__tmp, r)\ -static inline T ## _union_type_t N ## _ ## NK ## _type(N ## _table_t t__tmp)\ -__## NS ## union_type_field(((ID) - 1), t__tmp)\ -static inline NS ## generic_t N ## _ ## NK(N ## _table_t t__tmp)\ -__## NS ## table_field(NS ## generic_t, ID, t__tmp, r)\ -static inline int N ## _ ## NK ## _is_present(N ## _table_t t__tmp)\ -__## NS ## field_present(ID, t__tmp)\ -static inline T ## _union_t N ## _ ## NK ## _union(N ## _table_t t__tmp)\ -{ T ## _union_t u__tmp = { 0, 0 }; u__tmp.type = N ## _ ## NK ## _type_get(t__tmp);\ - if (u__tmp.type == 0) return u__tmp; u__tmp.value = N ## _ ## NK ## _get(t__tmp); return u__tmp; }\ -static inline NS ## string_t N ## _ ## NK ## _as_string(N ## _table_t t__tmp)\ -{ return NS ## string_cast_from_generic(N ## _ ## NK ## _get(t__tmp)); }\ - -#define __flatbuffers_define_union_vector_ops(NS, T)\ -static inline size_t T ## _union_vec_len(T ## _union_vec_t uv__tmp)\ -{ return NS ## vec_len(uv__tmp.type); }\ -static inline T ## _union_t T ## _union_vec_at(T ## _union_vec_t uv__tmp, size_t i__tmp)\ -{ T ## _union_t u__tmp = { 0, 0 }; size_t n__tmp = NS ## vec_len(uv__tmp.type);\ - assert(n__tmp > (i__tmp) && "index out of range"); u__tmp.type = uv__tmp.type[i__tmp];\ - /* Unknown type is treated as NONE for schema evolution. */\ - if (u__tmp.type == 0) return u__tmp;\ - u__tmp.value = NS ## generic_vec_at(uv__tmp.value, i__tmp); return u__tmp; }\ -static inline NS ## string_t T ## _union_vec_at_as_string(T ## _union_vec_t uv__tmp, size_t i__tmp)\ -{ return (NS ## string_t) NS ## generic_vec_at_as_string(uv__tmp.value, i__tmp); }\ - -#define __flatbuffers_define_union_vector(NS, T)\ -typedef NS ## union_vec_t T ## _union_vec_t;\ -__## NS ## define_union_vector_ops(NS, T) -#define __flatbuffers_define_union(NS, T)\ -typedef NS ## union_t T ## _union_t;\ -__## NS ## define_union_vector(NS, T) -#define __flatbuffers_define_union_vector_field(NS, ID, N, NK, T, r)\ -__## NS ## define_vector_field(ID - 1, N, NK ## _type, T ## _vec_t, r)\ -__## NS ## define_vector_field(ID, N, NK, flatbuffers_generic_vec_t, r)\ -static inline T ## _union_vec_t N ## _ ## NK ## _union(N ## _table_t t__tmp)\ -{ T ## _union_vec_t uv__tmp; uv__tmp.type = N ## _ ## NK ## _type_get(t__tmp);\ - uv__tmp.value = N ## _ ## NK(t__tmp);\ - assert(NS ## vec_len(uv__tmp.type) == NS ## vec_len(uv__tmp.value)\ - && "union vector type length mismatch"); return uv__tmp; } -#include -static size_t flatbuffers_not_found = (size_t)-1; -static size_t flatbuffers_end = (size_t)-1; -#define __flatbuffers_identity(n) (n) -#define __flatbuffers_min(a, b) ((a) < (b) ? (a) : (b)) -/* Subtraction doesn't work for unsigned types. */ -#define __flatbuffers_scalar_cmp(x, y, n) ((x) < (y) ? -1 : (x) > (y)) -static inline int __flatbuffers_string_n_cmp(flatbuffers_string_t v, const char *s, size_t n) -{ size_t nv = flatbuffers_string_len(v); int x = strncmp(v, s, nv < n ? nv : n); - return x != 0 ? x : nv < n ? 
-1 : nv > n; } -/* `n` arg unused, but needed by string find macro expansion. */ -static inline int __flatbuffers_string_cmp(flatbuffers_string_t v, const char *s, size_t n) { (void)n; return strcmp(v, s); } -/* A = identity if searching scalar vectors rather than key fields. */ -/* Returns lowest matching index or not_found. */ -#define __flatbuffers_find_by_field(A, V, E, L, K, Kn, T, D)\ -{ T v__tmp; size_t a__tmp = 0, b__tmp, m__tmp; if (!(b__tmp = L(V))) { return flatbuffers_not_found; }\ - --b__tmp;\ - while (a__tmp < b__tmp) {\ - m__tmp = a__tmp + ((b__tmp - a__tmp) >> 1);\ - v__tmp = A(E(V, m__tmp));\ - if ((D(v__tmp, (K), (Kn))) < 0) {\ - a__tmp = m__tmp + 1;\ - } else {\ - b__tmp = m__tmp;\ - }\ - }\ - if (a__tmp == b__tmp) {\ - v__tmp = A(E(V, a__tmp));\ - if (D(v__tmp, (K), (Kn)) == 0) {\ - return a__tmp;\ - }\ - }\ - return flatbuffers_not_found;\ -} -#define __flatbuffers_find_by_scalar_field(A, V, E, L, K, T)\ -__flatbuffers_find_by_field(A, V, E, L, K, 0, T, __flatbuffers_scalar_cmp) -#define __flatbuffers_find_by_string_field(A, V, E, L, K)\ -__flatbuffers_find_by_field(A, V, E, L, K, 0, flatbuffers_string_t, __flatbuffers_string_cmp) -#define __flatbuffers_find_by_string_n_field(A, V, E, L, K, Kn)\ -__flatbuffers_find_by_field(A, V, E, L, K, Kn, flatbuffers_string_t, __flatbuffers_string_n_cmp) -#define __flatbuffers_define_find_by_scalar_field(N, NK, TK)\ -static inline size_t N ## _vec_find_by_ ## NK(N ## _vec_t vec__tmp, TK key__tmp)\ -__flatbuffers_find_by_scalar_field(N ## _ ## NK, vec__tmp, N ## _vec_at, N ## _vec_len, key__tmp, TK) -#define __flatbuffers_define_scalar_find(N, T)\ -static inline size_t N ## _vec_find(N ## _vec_t vec__tmp, T key__tmp)\ -__flatbuffers_find_by_scalar_field(__flatbuffers_identity, vec__tmp, N ## _vec_at, N ## _vec_len, key__tmp, T) -#define __flatbuffers_define_find_by_string_field(N, NK) \ -/* Note: find only works on vectors sorted by this field. */\ -static inline size_t N ## _vec_find_by_ ## NK(N ## _vec_t vec__tmp, const char *s__tmp)\ -__flatbuffers_find_by_string_field(N ## _ ## NK, vec__tmp, N ## _vec_at, N ## _vec_len, s__tmp)\ -static inline size_t N ## _vec_find_n_by_ ## NK(N ## _vec_t vec__tmp, const char *s__tmp, int n__tmp)\ -__flatbuffers_find_by_string_n_field(N ## _ ## NK, vec__tmp, N ## _vec_at, N ## _vec_len, s__tmp, n__tmp) -#define __flatbuffers_define_default_find_by_scalar_field(N, NK, TK)\ -static inline size_t N ## _vec_find(N ## _vec_t vec__tmp, TK key__tmp)\ -{ return N ## _vec_find_by_ ## NK(vec__tmp, key__tmp); } -#define __flatbuffers_define_default_find_by_string_field(N, NK) \ -static inline size_t N ## _vec_find(N ## _vec_t vec__tmp, const char *s__tmp)\ -{ return N ## _vec_find_by_ ## NK(vec__tmp, s__tmp); }\ -static inline size_t N ## _vec_find_n(N ## _vec_t vec__tmp, const char *s__tmp, int n__tmp)\ -{ return N ## _vec_find_n_by_ ## NK(vec__tmp, s__tmp, n__tmp); } -/* A = identity if searching scalar vectors rather than key fields. */ -/* Returns lowest matching index or not_found. 
*/ -#define __flatbuffers_scan_by_field(b, e, A, V, E, L, K, Kn, T, D)\ -{ T v__tmp; size_t i__tmp;\ - for (i__tmp = b; i__tmp < e; ++i__tmp) {\ - v__tmp = A(E(V, i__tmp));\ - if (D(v__tmp, (K), (Kn)) == 0) {\ - return i__tmp;\ - }\ - }\ - return flatbuffers_not_found;\ -} -#define __flatbuffers_rscan_by_field(b, e, A, V, E, L, K, Kn, T, D)\ -{ T v__tmp; size_t i__tmp = e;\ - while (i__tmp-- > b) {\ - v__tmp = A(E(V, i__tmp));\ - if (D(v__tmp, (K), (Kn)) == 0) {\ - return i__tmp;\ - }\ - }\ - return flatbuffers_not_found;\ -} -#define __flatbuffers_scan_by_scalar_field(b, e, A, V, E, L, K, T)\ -__flatbuffers_scan_by_field(b, e, A, V, E, L, K, 0, T, __flatbuffers_scalar_cmp) -#define __flatbuffers_scan_by_string_field(b, e, A, V, E, L, K)\ -__flatbuffers_scan_by_field(b, e, A, V, E, L, K, 0, flatbuffers_string_t, __flatbuffers_string_cmp) -#define __flatbuffers_scan_by_string_n_field(b, e, A, V, E, L, K, Kn)\ -__flatbuffers_scan_by_field(b, e, A, V, E, L, K, Kn, flatbuffers_string_t, __flatbuffers_string_n_cmp) -#define __flatbuffers_rscan_by_scalar_field(b, e, A, V, E, L, K, T)\ -__flatbuffers_rscan_by_field(b, e, A, V, E, L, K, 0, T, __flatbuffers_scalar_cmp) -#define __flatbuffers_rscan_by_string_field(b, e, A, V, E, L, K)\ -__flatbuffers_rscan_by_field(b, e, A, V, E, L, K, 0, flatbuffers_string_t, __flatbuffers_string_cmp) -#define __flatbuffers_rscan_by_string_n_field(b, e, A, V, E, L, K, Kn)\ -__flatbuffers_rscan_by_field(b, e, A, V, E, L, K, Kn, flatbuffers_string_t, __flatbuffers_string_n_cmp) -#define __flatbuffers_define_scan_by_scalar_field(N, NK, T)\ -static inline size_t N ## _vec_scan_by_ ## NK(N ## _vec_t vec__tmp, T key__tmp)\ -__flatbuffers_scan_by_scalar_field(0, N ## _vec_len(vec__tmp), N ## _ ## NK ## _get, vec__tmp, N ## _vec_at, N ## _vec_len, key__tmp, T)\ -static inline size_t N ## _vec_scan_ex_by_ ## NK(N ## _vec_t vec__tmp, size_t begin__tmp, size_t end__tmp, T key__tmp)\ -__flatbuffers_scan_by_scalar_field(begin__tmp, __flatbuffers_min(end__tmp, N ## _vec_len(vec__tmp)), N ## _ ## NK ## _get, vec__tmp, N ## _vec_at, N ## _vec_len, key__tmp, T)\ -static inline size_t N ## _vec_rscan_by_ ## NK(N ## _vec_t vec__tmp, T key__tmp)\ -__flatbuffers_rscan_by_scalar_field(0, N ## _vec_len(vec__tmp), N ## _ ## NK ## _get, vec__tmp, N ## _vec_at, N ## _vec_len, key__tmp, T)\ -static inline size_t N ## _vec_rscan_ex_by_ ## NK(N ## _vec_t vec__tmp, size_t begin__tmp, size_t end__tmp, T key__tmp)\ -__flatbuffers_rscan_by_scalar_field(begin__tmp, __flatbuffers_min(end__tmp, N ## _vec_len(vec__tmp)), N ## _ ## NK ## _get, vec__tmp, N ## _vec_at, N ## _vec_len, key__tmp, T) -#define __flatbuffers_define_scalar_scan(N, T)\ -static inline size_t N ## _vec_scan(N ## _vec_t vec__tmp, T key__tmp)\ -__flatbuffers_scan_by_scalar_field(0, N ## _vec_len(vec__tmp), __flatbuffers_identity, vec__tmp, N ## _vec_at, N ## _vec_len, key__tmp, T)\ -static inline size_t N ## _vec_scan_ex(N ## _vec_t vec__tmp, size_t begin__tmp, size_t end__tmp, T key__tmp)\ -__flatbuffers_scan_by_scalar_field(begin__tmp, __flatbuffers_min(end__tmp, N ## _vec_len(vec__tmp)), __flatbuffers_identity, vec__tmp, N ## _vec_at, N ## _vec_len, key__tmp, T)\ -static inline size_t N ## _vec_rscan(N ## _vec_t vec__tmp, T key__tmp)\ -__flatbuffers_rscan_by_scalar_field(0, N ## _vec_len(vec__tmp), __flatbuffers_identity, vec__tmp, N ## _vec_at, N ## _vec_len, key__tmp, T)\ -static inline size_t N ## _vec_rscan_ex(N ## _vec_t vec__tmp, size_t begin__tmp, size_t end__tmp, T key__tmp)\ 
-__flatbuffers_rscan_by_scalar_field(begin__tmp, __flatbuffers_min(end__tmp, N ## _vec_len(vec__tmp)), __flatbuffers_identity, vec__tmp, N ## _vec_at, N ## _vec_len, key__tmp, T) -#define __flatbuffers_define_scan_by_string_field(N, NK) \ -static inline size_t N ## _vec_scan_by_ ## NK(N ## _vec_t vec__tmp, const char *s__tmp)\ -__flatbuffers_scan_by_string_field(0, N ## _vec_len(vec__tmp), N ## _ ## NK ## _get, vec__tmp, N ## _vec_at, N ## _vec_len, s__tmp)\ -static inline size_t N ## _vec_scan_n_by_ ## NK(N ## _vec_t vec__tmp, const char *s__tmp, int n__tmp)\ -__flatbuffers_scan_by_string_n_field(0, N ## _vec_len(vec__tmp), N ## _ ## NK ## _get, vec__tmp, N ## _vec_at, N ## _vec_len, s__tmp, n__tmp)\ -static inline size_t N ## _vec_scan_ex_by_ ## NK(N ## _vec_t vec__tmp, size_t begin__tmp, size_t end__tmp, const char *s__tmp)\ -__flatbuffers_scan_by_string_field(begin__tmp, __flatbuffers_min(end__tmp, N ## _vec_len(vec__tmp)), N ## _ ## NK ## _get, vec__tmp, N ## _vec_at, N ## _vec_len, s__tmp)\ -static inline size_t N ## _vec_scan_ex_n_by_ ## NK(N ## _vec_t vec__tmp, size_t begin__tmp, size_t end__tmp, const char *s__tmp, int n__tmp)\ -__flatbuffers_scan_by_string_n_field(begin__tmp, __flatbuffers_min( end__tmp, N ## _vec_len(vec__tmp)), N ## _ ## NK ## _get, vec__tmp, N ## _vec_at, N ## _vec_len, s__tmp, n__tmp)\ -static inline size_t N ## _vec_rscan_by_ ## NK(N ## _vec_t vec__tmp, const char *s__tmp)\ -__flatbuffers_rscan_by_string_field(0, N ## _vec_len(vec__tmp), N ## _ ## NK ## _get, vec__tmp, N ## _vec_at, N ## _vec_len, s__tmp)\ -static inline size_t N ## _vec_rscan_n_by_ ## NK(N ## _vec_t vec__tmp, const char *s__tmp, int n__tmp)\ -__flatbuffers_rscan_by_string_n_field(0, N ## _vec_len(vec__tmp), N ## _ ## NK ## _get, vec__tmp, N ## _vec_at, N ## _vec_len, s__tmp, n__tmp)\ -static inline size_t N ## _vec_rscan_ex_by_ ## NK(N ## _vec_t vec__tmp, size_t begin__tmp, size_t end__tmp, const char *s__tmp)\ -__flatbuffers_rscan_by_string_field(begin__tmp, __flatbuffers_min(end__tmp, N ## _vec_len(vec__tmp)), N ## _ ## NK ## _get, vec__tmp, N ## _vec_at, N ## _vec_len, s__tmp)\ -static inline size_t N ## _vec_rscan_ex_n_by_ ## NK(N ## _vec_t vec__tmp, size_t begin__tmp, size_t end__tmp, const char *s__tmp, int n__tmp)\ -__flatbuffers_rscan_by_string_n_field(begin__tmp, __flatbuffers_min( end__tmp, N ## _vec_len(vec__tmp)), N ## _ ## NK ## _get, vec__tmp, N ## _vec_at, N ## _vec_len, s__tmp, n__tmp) -#define __flatbuffers_define_default_scan_by_scalar_field(N, NK, TK)\ -static inline size_t N ## _vec_scan(N ## _vec_t vec__tmp, TK key__tmp)\ -{ return N ## _vec_scan_by_ ## NK(vec__tmp, key__tmp); }\ -static inline size_t N ## _vec_scan_ex(N ## _vec_t vec__tmp, size_t begin__tmp, size_t end__tmp, TK key__tmp)\ -{ return N ## _vec_scan_ex_by_ ## NK(vec__tmp, begin__tmp, end__tmp, key__tmp); }\ -static inline size_t N ## _vec_rscan(N ## _vec_t vec__tmp, TK key__tmp)\ -{ return N ## _vec_rscan_by_ ## NK(vec__tmp, key__tmp); }\ -static inline size_t N ## _vec_rscan_ex(N ## _vec_t vec__tmp, size_t begin__tmp, size_t end__tmp, TK key__tmp)\ -{ return N ## _vec_rscan_ex_by_ ## NK(vec__tmp, begin__tmp, end__tmp, key__tmp); } -#define __flatbuffers_define_default_scan_by_string_field(N, NK) \ -static inline size_t N ## _vec_scan(N ## _vec_t vec__tmp, const char *s__tmp)\ -{ return N ## _vec_scan_by_ ## NK(vec__tmp, s__tmp); }\ -static inline size_t N ## _vec_scan_n(N ## _vec_t vec__tmp, const char *s__tmp, int n__tmp)\ -{ return N ## _vec_scan_n_by_ ## NK(vec__tmp, s__tmp, n__tmp); }\ -static inline 
size_t N ## _vec_scan_ex(N ## _vec_t vec__tmp, size_t begin__tmp, size_t end__tmp, const char *s__tmp)\ -{ return N ## _vec_scan_ex_by_ ## NK(vec__tmp, begin__tmp, end__tmp, s__tmp); }\ -static inline size_t N ## _vec_scan_ex_n(N ## _vec_t vec__tmp, size_t begin__tmp, size_t end__tmp, const char *s__tmp, int n__tmp)\ -{ return N ## _vec_scan_ex_n_by_ ## NK(vec__tmp, begin__tmp, end__tmp, s__tmp, n__tmp); }\ -static inline size_t N ## _vec_rscan(N ## _vec_t vec__tmp, const char *s__tmp)\ -{ return N ## _vec_rscan_by_ ## NK(vec__tmp, s__tmp); }\ -static inline size_t N ## _vec_rscan_n(N ## _vec_t vec__tmp, const char *s__tmp, int n__tmp)\ -{ return N ## _vec_rscan_n_by_ ## NK(vec__tmp, s__tmp, n__tmp); }\ -static inline size_t N ## _vec_rscan_ex(N ## _vec_t vec__tmp, size_t begin__tmp, size_t end__tmp, const char *s__tmp)\ -{ return N ## _vec_rscan_ex_by_ ## NK(vec__tmp, begin__tmp, end__tmp, s__tmp); }\ -static inline size_t N ## _vec_rscan_ex_n(N ## _vec_t vec__tmp, size_t begin__tmp, size_t end__tmp, const char *s__tmp, int n__tmp)\ -{ return N ## _vec_rscan_ex_n_by_ ## NK(vec__tmp, begin__tmp, end__tmp, s__tmp, n__tmp); } -#define __flatbuffers_heap_sort(N, X, A, E, L, TK, TE, D, S)\ -static inline void __ ## N ## X ## __heap_sift_down(\ - N ## _mutable_vec_t vec__tmp, size_t start__tmp, size_t end__tmp)\ -{ size_t child__tmp, root__tmp; TK v1__tmp, v2__tmp, vroot__tmp;\ - root__tmp = start__tmp;\ - while ((root__tmp << 1) <= end__tmp) {\ - child__tmp = root__tmp << 1;\ - if (child__tmp < end__tmp) {\ - v1__tmp = A(E(vec__tmp, child__tmp));\ - v2__tmp = A(E(vec__tmp, child__tmp + 1));\ - if (D(v1__tmp, v2__tmp) < 0) {\ - child__tmp++;\ - }\ - }\ - vroot__tmp = A(E(vec__tmp, root__tmp));\ - v1__tmp = A(E(vec__tmp, child__tmp));\ - if (D(vroot__tmp, v1__tmp) < 0) {\ - S(vec__tmp, root__tmp, child__tmp, TE);\ - root__tmp = child__tmp;\ - } else {\ - return;\ - }\ - }\ -}\ -static inline void __ ## N ## X ## __heap_sort(N ## _mutable_vec_t vec__tmp)\ -{ size_t start__tmp, end__tmp, size__tmp;\ - size__tmp = L(vec__tmp); if (size__tmp == 0) return; end__tmp = size__tmp - 1; start__tmp = size__tmp >> 1;\ - do { __ ## N ## X ## __heap_sift_down(vec__tmp, start__tmp, end__tmp); } while (start__tmp--);\ - while (end__tmp > 0) { \ - S(vec__tmp, 0, end__tmp, TE);\ - __ ## N ## X ## __heap_sift_down(vec__tmp, 0, --end__tmp); } } -#define __flatbuffers_define_sort_by_field(N, NK, TK, TE, D, S)\ - __flatbuffers_heap_sort(N, _sort_by_ ## NK, N ## _ ## NK ## _get, N ## _vec_at, N ## _vec_len, TK, TE, D, S)\ -static inline void N ## _vec_sort_by_ ## NK(N ## _mutable_vec_t vec__tmp)\ -{ __ ## N ## _sort_by_ ## NK ## __heap_sort(vec__tmp); } -#define __flatbuffers_define_sort(N, TK, TE, D, S)\ -__flatbuffers_heap_sort(N, , __flatbuffers_identity, N ## _vec_at, N ## _vec_len, TK, TE, D, S)\ -static inline void N ## _vec_sort(N ## _mutable_vec_t vec__tmp) { __ ## N ## __heap_sort(vec__tmp); } -#define __flatbuffers_scalar_diff(x, y) ((x) < (y) ? 
-1 : (x) > (y)) -#define __flatbuffers_string_diff(x, y) __flatbuffers_string_n_cmp((x), (const char *)(y), flatbuffers_string_len(y)) -#define __flatbuffers_scalar_swap(vec, a, b, TE) { TE x__tmp = vec[b]; vec[b] = vec[a]; vec[a] = x__tmp; } -#define __flatbuffers_string_swap(vec, a, b, TE)\ -{ TE ta__tmp, tb__tmp, d__tmp;\ - d__tmp = (TE)((a - b) * sizeof(vec[0]));\ - ta__tmp = __flatbuffers_uoffset_read_from_pe(vec + b) - d__tmp;\ - tb__tmp = __flatbuffers_uoffset_read_from_pe(vec + a) + d__tmp;\ - __flatbuffers_uoffset_write_to_pe(vec + a, ta__tmp);\ - __flatbuffers_uoffset_write_to_pe(vec + b, tb__tmp); } -#define __flatbuffers_define_sort_by_scalar_field(N, NK, TK, TE)\ - __flatbuffers_define_sort_by_field(N, NK, TK, TE, __flatbuffers_scalar_diff, __flatbuffers_scalar_swap) -#define __flatbuffers_define_sort_by_string_field(N, NK)\ - __flatbuffers_define_sort_by_field(N, NK, flatbuffers_string_t, flatbuffers_uoffset_t, __flatbuffers_string_diff, __flatbuffers_string_swap) -#define __flatbuffers_define_scalar_sort(N, T) __flatbuffers_define_sort(N, T, T, __flatbuffers_scalar_diff, __flatbuffers_scalar_swap) -#define __flatbuffers_define_string_sort() __flatbuffers_define_sort(flatbuffers_string, flatbuffers_string_t, flatbuffers_uoffset_t, __flatbuffers_string_diff, __flatbuffers_string_swap) -#define __flatbuffers_define_scalar_vector(N, T)\ -typedef const T *N ## _vec_t;\ -typedef T *N ## _mutable_vec_t;\ -__flatbuffers_define_scalar_vec_len(N)\ -__flatbuffers_define_scalar_vec_at(N, T)\ -__flatbuffers_define_scalar_find(N, T)\ -__flatbuffers_define_scalar_scan(N, T)\ -__flatbuffers_define_scalar_sort(N, T) - -#define __flatbuffers_define_integer_type(N, T, W)\ -__flatcc_define_integer_accessors(N, T, W, flatbuffers_endian)\ -__flatbuffers_define_scalar_vector(N, T) -__flatbuffers_define_scalar_vector(flatbuffers_bool, flatbuffers_bool_t) -__flatbuffers_define_scalar_vector(flatbuffers_uint8, uint8_t) -__flatbuffers_define_scalar_vector(flatbuffers_int8, int8_t) -__flatbuffers_define_scalar_vector(flatbuffers_uint16, uint16_t) -__flatbuffers_define_scalar_vector(flatbuffers_int16, int16_t) -__flatbuffers_define_scalar_vector(flatbuffers_uint32, uint32_t) -__flatbuffers_define_scalar_vector(flatbuffers_int32, int32_t) -__flatbuffers_define_scalar_vector(flatbuffers_uint64, uint64_t) -__flatbuffers_define_scalar_vector(flatbuffers_int64, int64_t) -__flatbuffers_define_scalar_vector(flatbuffers_float, float) -__flatbuffers_define_scalar_vector(flatbuffers_double, double) -__flatbuffers_define_scalar_vector(flatbuffers_union_type, flatbuffers_union_type_t) -static inline size_t flatbuffers_string_vec_find(flatbuffers_string_vec_t vec, const char *s) -__flatbuffers_find_by_string_field(__flatbuffers_identity, vec, flatbuffers_string_vec_at, flatbuffers_string_vec_len, s) -static inline size_t flatbuffers_string_vec_find_n(flatbuffers_string_vec_t vec, const char *s, size_t n) -__flatbuffers_find_by_string_n_field(__flatbuffers_identity, vec, flatbuffers_string_vec_at, flatbuffers_string_vec_len, s, n) -static inline size_t flatbuffers_string_vec_scan(flatbuffers_string_vec_t vec, const char *s) -__flatbuffers_scan_by_string_field(0, flatbuffers_string_vec_len(vec), __flatbuffers_identity, vec, flatbuffers_string_vec_at, flatbuffers_string_vec_len, s) -static inline size_t flatbuffers_string_vec_scan_n(flatbuffers_string_vec_t vec, const char *s, size_t n) -__flatbuffers_scan_by_string_n_field(0, flatbuffers_string_vec_len(vec), __flatbuffers_identity, vec, flatbuffers_string_vec_at, 
flatbuffers_string_vec_len, s, n) -static inline size_t flatbuffers_string_vec_scan_ex(flatbuffers_string_vec_t vec, size_t begin, size_t end, const char *s) -__flatbuffers_scan_by_string_field(begin, __flatbuffers_min(end, flatbuffers_string_vec_len(vec)), __flatbuffers_identity, vec, flatbuffers_string_vec_at, flatbuffers_string_vec_len, s) -static inline size_t flatbuffers_string_vec_scan_ex_n(flatbuffers_string_vec_t vec, size_t begin, size_t end, const char *s, size_t n) -__flatbuffers_scan_by_string_n_field(begin, __flatbuffers_min(end, flatbuffers_string_vec_len(vec)), __flatbuffers_identity, vec, flatbuffers_string_vec_at, flatbuffers_string_vec_len, s, n) -static inline size_t flatbuffers_string_vec_rscan(flatbuffers_string_vec_t vec, const char *s) -__flatbuffers_rscan_by_string_field(0, flatbuffers_string_vec_len(vec), __flatbuffers_identity, vec, flatbuffers_string_vec_at, flatbuffers_string_vec_len, s) -static inline size_t flatbuffers_string_vec_rscan_n(flatbuffers_string_vec_t vec, const char *s, size_t n) -__flatbuffers_rscan_by_string_n_field(0, flatbuffers_string_vec_len(vec), __flatbuffers_identity, vec, flatbuffers_string_vec_at, flatbuffers_string_vec_len, s, n) -static inline size_t flatbuffers_string_vec_rscan_ex(flatbuffers_string_vec_t vec, size_t begin, size_t end, const char *s) -__flatbuffers_rscan_by_string_field(begin, __flatbuffers_min(end, flatbuffers_string_vec_len(vec)), __flatbuffers_identity, vec, flatbuffers_string_vec_at, flatbuffers_string_vec_len, s) -static inline size_t flatbuffers_string_vec_rscan_ex_n(flatbuffers_string_vec_t vec, size_t begin, size_t end, const char *s, size_t n) -__flatbuffers_rscan_by_string_n_field(begin, __flatbuffers_min(end, flatbuffers_string_vec_len(vec)), __flatbuffers_identity, vec, flatbuffers_string_vec_at, flatbuffers_string_vec_len, s, n) -__flatbuffers_define_string_sort() -#define __flatbuffers_define_struct_scalar_field(N, NK, TK, T)\ -static inline T N ## _ ## NK ## _get(N ## _struct_t t__tmp)\ -{ return t__tmp ? __flatbuffers_read_scalar(TK, &(t__tmp->NK)) : 0; }\ -static inline const T *N ## _ ## NK ## _get_ptr(N ## _struct_t t__tmp)\ -{ return t__tmp ? &(t__tmp->NK) : 0; }\ -static inline T N ## _ ## NK (N ## _struct_t t__tmp)\ -{ return t__tmp ? __flatbuffers_read_scalar(TK, &(t__tmp->NK)) : 0; }\ -__flatbuffers_define_scan_by_scalar_field(N, NK, T) -#define __flatbuffers_define_struct_struct_field(N, NK, T)\ -static inline T N ## _ ## NK ## _get(N ## _struct_t t__tmp) { return t__tmp ? &(t__tmp->NK) : 0; }\ -static inline T N ## _ ## NK (N ## _struct_t t__tmp) { return t__tmp ? &(t__tmp->NK) : 0; } -/* If fid is null, the function returns true without testing as buffer is not expected to have any id. 
*/ -static inline int flatbuffers_has_identifier(const void *buffer, const char *fid) -{ flatbuffers_thash_t id, id2 = 0; if (fid == 0) { return 1; }; - id2 = flatbuffers_type_hash_from_string(fid); - id = __flatbuffers_thash_read_from_pe(((flatbuffers_uoffset_t *)buffer) + 1); - return id2 == 0 || id == id2; } -static inline int flatbuffers_has_type_hash(const void *buffer, flatbuffers_thash_t thash) -{ return thash == 0 || (__flatbuffers_thash_read_from_pe((flatbuffers_uoffset_t *)buffer + 1) == thash); } - -static inline flatbuffers_thash_t flatbuffers_get_type_hash(const void *buffer) -{ return __flatbuffers_thash_read_from_pe((flatbuffers_uoffset_t *)buffer + 1); } - -#define flatbuffers_verify_endian() flatbuffers_has_identifier("\x00\x00\x00\x00" "1234", "1234") -static inline void *flatbuffers_read_size_prefix(void *b, size_t *size_out) -{ if (size_out) { *size_out = (size_t)__flatbuffers_uoffset_read_from_pe(b); } - return (uint8_t *)b + sizeof(flatbuffers_uoffset_t); } -/* Null file identifier accepts anything, otherwise fid should be 4 characters. */ -#define __flatbuffers_read_root(T, K, buffer, fid)\ - ((!buffer || !flatbuffers_has_identifier(buffer, fid)) ? 0 :\ - ((T ## _ ## K ## t)(((uint8_t *)buffer) +\ - __flatbuffers_uoffset_read_from_pe(buffer)))) -#define __flatbuffers_read_typed_root(T, K, buffer, thash)\ - ((!buffer || !flatbuffers_has_type_hash(buffer, thash)) ? 0 :\ - ((T ## _ ## K ## t)(((uint8_t *)buffer) +\ - __flatbuffers_uoffset_read_from_pe(buffer)))) -#define __flatbuffers_nested_buffer_as_root(C, N, T, K)\ -static inline T ## _ ## K ## t C ## _ ## N ## _as_root_with_identifier(C ## _ ## table_t t__tmp, const char *fid__tmp)\ -{ const uint8_t *buffer__tmp = C ## _ ## N(t__tmp); return __flatbuffers_read_root(T, K, buffer__tmp, fid__tmp); }\ -static inline T ## _ ## K ## t C ## _ ## N ## _as_typed_root(C ## _ ## table_t t__tmp)\ -{ const uint8_t *buffer__tmp = C ## _ ## N(t__tmp); return __flatbuffers_read_root(T, K, buffer__tmp, C ## _ ## type_identifier); }\ -static inline T ## _ ## K ## t C ## _ ## N ## _as_root(C ## _ ## table_t t__tmp)\ -{ const char *fid__tmp = T ## _identifier;\ - const uint8_t *buffer__tmp = C ## _ ## N(t__tmp); return __flatbuffers_read_root(T, K, buffer__tmp, fid__tmp); } -#define __flatbuffers_buffer_as_root(N, K)\ -static inline N ## _ ## K ## t N ## _as_root_with_identifier(const void *buffer__tmp, const char *fid__tmp)\ -{ return __flatbuffers_read_root(N, K, buffer__tmp, fid__tmp); }\ -static inline N ## _ ## K ## t N ## _as_root_with_type_hash(const void *buffer__tmp, flatbuffers_thash_t thash__tmp)\ -{ return __flatbuffers_read_typed_root(N, K, buffer__tmp, thash__tmp); }\ -static inline N ## _ ## K ## t N ## _as_root(const void *buffer__tmp)\ -{ const char *fid__tmp = N ## _identifier;\ - return __flatbuffers_read_root(N, K, buffer__tmp, fid__tmp); }\ -static inline N ## _ ## K ## t N ## _as_typed_root(const void *buffer__tmp)\ -{ return __flatbuffers_read_typed_root(N, K, buffer__tmp, N ## _type_hash); } -#define __flatbuffers_struct_as_root(N) __flatbuffers_buffer_as_root(N, struct_) -#define __flatbuffers_table_as_root(N) __flatbuffers_buffer_as_root(N, table_) - -#include "flatcc/flatcc_epilogue.h" -#endif /* FLATBUFFERS_COMMON_H */ diff --git a/common/src/ini.c b/common/src/ini.c index 1e64632a4..a2865923d 100644 --- a/common/src/ini.c +++ b/common/src/ini.c @@ -17,6 +17,7 @@ Go to the project home page for more info: #include #include "ini.h" +#include "unifyfs_misc.h" #if !INI_USE_STACK #include @@ -73,8 +74,7 @@ 
static char *find_chars_or_comment(const char *s, const char *chars)
 /* Version of strncpy that ensures dest (size bytes) is null-terminated. */
 static char *strncpy0(char *dest, const char *src, size_t size)
 {
- strncpy(dest, src, size);
- dest[size - 1] = '\0';
+ strlcpy(dest, src, size);
 return dest;
 }
diff --git a/common/src/rm_enumerator.c b/common/src/rm_enumerator.c
index efd3a8922..d5641a0a2 100644
--- a/common/src/rm_enumerator.c
+++ b/common/src/rm_enumerator.c
@@ -1,8 +1,8 @@
 /*
- * Copyright (c) 2017, Lawrence Livermore National Security, LLC.
+ * Copyright (c) 2020, Lawrence Livermore National Security, LLC.
 * Produced at the Lawrence Livermore National Laboratory.
 *
- * Copyright 2017, UT-Battelle, LLC.
+ * Copyright 2020, UT-Battelle, LLC.
 *
 * LLNL-CODE-741539
 * All rights reserved.
diff --git a/common/src/rm_enumerator.h b/common/src/rm_enumerator.h
index 4019a5b55..f8bfddf0b 100644
--- a/common/src/rm_enumerator.h
+++ b/common/src/rm_enumerator.h
@@ -1,8 +1,8 @@
 /*
- * Copyright (c) 2017, Lawrence Livermore National Security, LLC.
+ * Copyright (c) 2020, Lawrence Livermore National Security, LLC.
 * Produced at the Lawrence Livermore National Laboratory.
 *
- * Copyright 2017, UT-Battelle, LLC.
+ * Copyright 2020, UT-Battelle, LLC.
 *
 * LLNL-CODE-741539
 * All rights reserved.
diff --git a/common/src/seg_tree.c b/common/src/seg_tree.c
new file mode 100644
index 000000000..c47e3e10d
--- /dev/null
+++ b/common/src/seg_tree.c
@@ -0,0 +1,601 @@
+/*
+ * Copyright (c) 2020, Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ *
+ * Copyright 2020, UT-Battelle, LLC.
+ *
+ * LLNL-CODE-741539
+ * All rights reserved.
+ *
+ * This is the license for UnifyFS.
+ * For details, see https://github.com/LLNL/UnifyFS.
+ * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text.
+ */
+
+ /*
+ * This file is a simple, thread-safe segment tree implementation. The
+ * segments in the tree are non-overlapping. Added segments overwrite the old
+ * segments in the tree. This is used to coalesce writes before an fsync.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <errno.h>
+#include <pthread.h>
+
+#include "seg_tree.h"
+#include "tree.h"
+#include "unifyfs_log.h"
+
+#ifndef MIN
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
+
+#ifndef MAX
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#endif
+
+static int
+compare_func(struct seg_tree_node* node1, struct seg_tree_node* node2)
+{
+ if (node1->start > node2->end) {
+ return 1;
+ } else if (node1->end < node2->start) {
+ return -1;
+ } else {
+ return 0;
+ }
+}
+
+RB_PROTOTYPE(inttree, seg_tree_node, entry, compare_func)
+RB_GENERATE(inttree, seg_tree_node, entry, compare_func)
+
+/* Returns 0 on success, positive non-zero error code otherwise */
+int seg_tree_init(struct seg_tree* seg_tree)
+{
+ memset(seg_tree, 0, sizeof(*seg_tree));
+ pthread_rwlock_init(&seg_tree->rwlock, NULL);
+ RB_INIT(&seg_tree->head);
+
+ return 0;
+}
+
+/*
+ * Remove and free all nodes in the seg_tree.
+ */
+void seg_tree_destroy(struct seg_tree* seg_tree)
+{
+ seg_tree_clear(seg_tree);
+}
+
+/* Allocate a node for the range tree.
Free node with free() when finished */
+static struct seg_tree_node*
+seg_tree_node_alloc(unsigned long start, unsigned long end, unsigned long ptr)
+{
+ /* allocate a new node structure */
+ struct seg_tree_node* node;
+ node = calloc(1, sizeof(*node));
+ if (!node) {
+ return NULL;
+ }
+
+ /* record logical range and physical offset */
+ node->start = start;
+ node->end = end;
+ node->ptr = ptr;
+
+ return node;
+}
+
+/*
+ * Given two start/end ranges, return a new range from start1/end1 that
+ * does not overlap start2/end2. The non-overlapping range is stored
+ * in new_start/new_end. If there are no non-overlapping ranges,
+ * return 1 from this function, else return 0. If there are two
+ * non-overlapping ranges, return the first one in new_start/new_end.
+ */
+static int get_non_overlapping_range(
+ unsigned long start1, unsigned long end1,
+ long start2, long end2,
+ long* new_start, long* new_end)
+{
+ /*
+ * This function is only called when we know that segment 1 and segment 2
+ * overlap with each other. Find first portion of segment 1 that does not
+ * overlap with segment 2, if any.
+ */
+ if (start1 < start2) {
+ /*
+ * Segment 1 includes a portion before segment 2 starts; return start/end
+ * of that leading portion of segment 1.
+ *
+ * s1-------e1
+ * s2--------e2
+ * ---- non-overlap
+ */
+ *new_start = start1;
+ *new_end = start2 - 1;
+ return 0;
+ } else if (end1 > end2) {
+ /*
+ * Segment 1 does not start before segment 2, but segment 1 extends past
+ * end of segment 2. Return start/end of trailing portion of segment 1.
+ *
+ * s1-----e1
+ * s2-------e2
+ * --- non-overlap
+ */
+ *new_start = end2 + 1;
+ *new_end = end1;
+ return 0;
+ }
+
+ /*
+ * Segment 2 completely envelops segment 1 so nothing left of segment 1 to
+ * return, so return 1 to indicate this case.
+ *
+ * s1-------e1
+ * s2-------------e2
+ */
+ return 1;
+}
+
+/*
+ * Add an entry to the range tree. Returns 0 on success, nonzero otherwise.
+ */
+int seg_tree_add(struct seg_tree* seg_tree, unsigned long start,
+ unsigned long end, unsigned long ptr)
+{
+ /* Assume we'll succeed */
+ int rc = 0;
+ struct seg_tree_node* node;
+ struct seg_tree_node* remaining;
+ struct seg_tree_node* resized;
+ struct seg_tree_node* overlap;
+ struct seg_tree_node* target;
+ struct seg_tree_node* prev;
+ struct seg_tree_node* next;
+ long new_start;
+ long new_end;
+ unsigned long ptr_end;
+ int ret;
+
+ /* Create our range */
+ node = seg_tree_node_alloc(start, end, ptr);
+ if (!node) {
+ return ENOMEM;
+ }
+
+ /* Lock the tree so we can modify it */
+ seg_tree_wrlock(seg_tree);
+
+ /*
+ * Try to insert our range into the RB tree. If it overlaps with any other
+ * range, then it is not inserted, and the overlapping range node is
+ * returned in 'overlap'. If 'overlap' is NULL, then there were no
+ * overlaps, and our range was successfully inserted.
+ */
+ overlap = NULL;
+ while ((overlap = RB_INSERT(inttree, &seg_tree->head, node))) {
+ /*
+ * Our range overlaps with another range (in 'overlap'). Is there any
+ * part of 'overlap' that does not overlap our range? If so,
+ * delete the old 'overlap' and insert the smaller, non-overlapping
+ * range.
+ */
+ ret = get_non_overlapping_range(overlap->start, overlap->end, start,
+ end, &new_start, &new_end);
+ if (ret) {
+ /*
+ * The new range we are adding completely covers the existing
+ * range in the tree defined in overlap. We can't find a
+ * non-overlapping range. Delete the existing range.
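+ * For example, adding [0, 100] when the tree already holds [20, 30]
+ * lands here: [20, 30] is completely covered, so it is removed and
+ * the insert is retried on the next pass of the loop.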
+ */
+ RB_REMOVE(inttree, &seg_tree->head, overlap);
+ free(overlap);
+ seg_tree->count--;
+ } else {
+ /*
+ * Part of the old range was non-overlapping. Split the old range
+ * into two ranges: one for the non-overlapping section, and one for
+ * the remaining section. The non-overlapping section gets
+ * inserted without issue. The remaining section will be processed
+ * on the next pass of this while() loop.
+ */
+ resized = seg_tree_node_alloc(new_start, new_end,
+ overlap->ptr + (new_start - overlap->start));
+ if (!resized) {
+ free(node);
+ rc = ENOMEM;
+ goto release_add;
+ }
+
+ /*
+ * If the non-overlapping part came from the front portion of the
+ * existing range, then there is a trailing portion of the
+ * existing range to add back to be considered again in the next
+ * loop iteration.
+ */
+ remaining = NULL;
+ if (resized->end < overlap->end) {
+ /*
+ * There's still a remaining section after the non-overlapping
+ * part. Add it in.
+ */
+ remaining = seg_tree_node_alloc(
+ resized->end + 1, overlap->end,
+ overlap->ptr + (resized->end + 1 - overlap->start));
+ if (!remaining) {
+ free(node);
+ free(resized);
+ rc = ENOMEM;
+ goto release_add;
+ }
+ }
+
+ /* Remove our old range */
+ RB_REMOVE(inttree, &seg_tree->head, overlap);
+ free(overlap);
+ seg_tree->count--;
+
+ /* Insert the non-overlapping part of the new range */
+ RB_INSERT(inttree, &seg_tree->head, resized);
+ seg_tree->count++;
+
+ /*
+ * If we have a trailing portion, insert range for that, and
+ * increase our extent count since we just turned one range entry
+ * into two
+ */
+ if (remaining != NULL) {
+ RB_INSERT(inttree, &seg_tree->head, remaining);
+ seg_tree->count++;
+ }
+ }
+ }
+
+ /* Increment segment count in the tree for the range we just added */
+ seg_tree->count++;
+
+ /*
+ * Update max ending offset if end of new range we just inserted
+ * is larger.
+ */
+ seg_tree->max = MAX(seg_tree->max, end);
+
+ /* Get temporary pointer to the node we just added. */
+ target = node;
+
+ /* Check whether we can coalesce new extent with any preceding extent. */
+ prev = RB_PREV(inttree, &seg_tree->head, target);
+ if ((prev != NULL) && ((prev->end + 1) == target->start)) {
+ /*
+ * We found an extent that ends just before the new extent starts.
+ * Check whether they are also contiguous in the log.
+ */
+ ptr_end = prev->ptr + (prev->end - prev->start + 1);
+ if (ptr_end == target->ptr) {
+ /*
+ * The preceding extent describes a log position adjacent to
+ * the extent we just added, so we can merge them.
+ * Append entry to previous by extending end of previous.
+ */
+ prev->end = target->end;
+
+ /* Delete new extent from the tree and free it. */
+ RB_REMOVE(inttree, &seg_tree->head, target);
+ free(target);
+ seg_tree->count--;
+
+ /*
+ * Update target to point at previous extent since we just
+ * merged our new extent into it.
+ */
+ target = prev;
+ }
+ }
+
+ /* Check whether we can coalesce new extent with any trailing extent. */
+ next = RB_NEXT(inttree, &seg_tree->head, target);
+ if ((next != NULL) && ((target->end + 1) == next->start)) {
+ /*
+ * We found an extent that starts just after the new extent ends.
+ * Check whether they are also contiguous in the log.
+ */
+ ptr_end = target->ptr + (target->end - target->start + 1);
+ if (ptr_end == next->ptr) {
+ /*
+ * The target extent describes a log position adjacent to
+ * the next extent, so we can merge them.
+ * Append entry to target by extending the end of target to cover next.
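+ * For example, a target extent [0, 99] at log ptr 0 and a next extent
+ * [100, 199] at log ptr 100 pass both checks and are merged into a
+ * single extent [0, 199] at log ptr 0.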
+ */ + target->end = next->end; + + /* Delete next extent from the tree and free it. */ + RB_REMOVE(inttree, &seg_tree->head, next); + free(next); + seg_tree->count--; + } + } + +release_add: + + seg_tree_unlock(seg_tree); + + return rc; +} + +/* + * Remove or truncate one or more entries from the range tree + * if they overlap [start, end]. + * + * Returns 0 on success, nonzero otherwise. + */ +int seg_tree_remove( + struct seg_tree* seg_tree, + unsigned long start, + unsigned long end) +{ + struct seg_tree_node* node; + + LOGDBG("removing extents overlapping [%lu, %lu]", start, end); + + seg_tree_wrlock(seg_tree); + node = seg_tree_find_nolock(seg_tree, start, end); + while (node != NULL) { + if (start <= node->start) { + if (node->end <= end) { + /* start <= node_s <= node_e <= end + * remove whole extent */ + LOGDBG("removing node [%lu, %lu]", node->start, node->end); + RB_REMOVE(inttree, &seg_tree->head, node); + free(node); + seg_tree->count--; + } else { + /* start <= node_s <= end < node_e + * update node start */ + LOGDBG("updating node start from %lu to %lu", + node->start, (end + 1)); + + node->ptr += (end + 1 - node->start); + node->start = end + 1; + } + } else if (node->start < start) { + if (node->end <= end) { + /* node_s < start <= node_e <= end + * truncate node */ + LOGDBG("updating node end from %lu to %lu", + node->end, (start - 1)); + node->end = start - 1; + } else { + /* node_s < start <= end < node_e + * extent spans entire region, split into two nodes + * representing before/after region */ + unsigned long a_end = node->end; + unsigned long a_start = end + 1; + unsigned long a_ptr = node->ptr + (a_start - node->start); + + /* truncate existing (before) node */ + LOGDBG("updating before node end from %lu to %lu", + node->end, (start - 1)); + node->end = start - 1; + + /* add new (after) node */ + LOGDBG("add after node [%lu, %lu]", a_start, a_end); + seg_tree_unlock(seg_tree); + int rc = seg_tree_add(seg_tree, a_start, a_end, a_ptr); + if (rc) { + LOGERR("seg_tree_add() failed when splitting"); + return rc; + } + seg_tree_wrlock(seg_tree); + } + } + /* keep looking for nodes that overlap target region */ + node = seg_tree_find_nolock(seg_tree, start, end); + } + seg_tree_unlock(seg_tree); + + return 0; +} + +/* + * Search tree for an entry that overlaps with given range of [start, end]. + * Returns the first overlapping entry if found, which is the overlapping entry + * having the lowest starting offset, and returns NULL otherwise. + * + * This function assumes you've already locked the seg_tree. + */ +struct seg_tree_node* seg_tree_find_nolock( + struct seg_tree* seg_tree, + unsigned long start, + unsigned long end) +{ + /* Create a range of just our starting byte offset */ + struct seg_tree_node* node = seg_tree_node_alloc(start, start, 0); + if (!node) { + return NULL; + } + + /* Search tree for either a range that overlaps with + * the target range (starting byte), or otherwise the + * node for the next biggest starting byte. */ + struct seg_tree_node* next = RB_NFIND(inttree, &seg_tree->head, node); + + free(node); + + /* We may have found a node that doesn't include our starting + * byte offset, but it would be the range with the lowest + * starting offset after the target starting offset. Check whether + * this overlaps our end offset */ + if (next && next->start <= end) { + return next; + } + + /* Otherwise, there is not element that overlaps with the + * target range of [start, end]. 
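+ * For example, with extents [0, 9] and [20, 29] in the tree, a search
+ * for [10, 15] falls through to this NULL return, while a search for
+ * [5, 25] returns the [0, 9] node above.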
*/ + return NULL; +} + +/* + * Search tree for an entry that overlaps with given range of [start, end]. + * Returns the first overlapping entry if found, which is the overlapping entry + * having the lowest starting offset, and returns NULL otherwise. + */ +struct seg_tree_node* seg_tree_find( + struct seg_tree* seg_tree, + unsigned long start, + unsigned long end) +{ + struct seg_tree_node* node; + + seg_tree_rdlock(seg_tree); + node = seg_tree_find_nolock(seg_tree, start, end); + seg_tree_unlock(seg_tree); + + return node; +} + +/* + * Given a range tree and a starting node, iterate though all the nodes + * in the tree, returning the next one each time. If start is NULL, then + * start with the first node in the tree. + * + * This is meant to be called in a loop, like: + * + * seg_tree_rdlock(seg_tree); + * + * struct seg_tree_node *node = NULL; + * while ((node = seg_tree_iter(seg_tree, node))) { + * printf("[%d-%d]", node->start, node->end); + * } + * + * seg_tree_unlock(seg_tree); + * + * Note: this function does no locking, and assumes you're properly locking + * and unlocking the seg_tree before doing the iteration (see + * seg_tree_rdlock()/seg_tree_wrlock()/seg_tree_unlock()). + */ +struct seg_tree_node* +seg_tree_iter(struct seg_tree* seg_tree, struct seg_tree_node* start) +{ + struct seg_tree_node* next = NULL; + struct seg_tree_node* tmp = NULL; + if (start == NULL) { + /* Initial case, no starting node */ + next = RB_MIN(inttree, &seg_tree->head); + return next; + } + + /* + * We were given a valid start node. Look it up to start our traversal + * from there. + */ + tmp = RB_FIND(inttree, &seg_tree->head, start); + if (!tmp) { + /* Some kind of error */ + return NULL; + } + + /* Look up our next node */ + next = RB_NEXT(inttree, &seg_tree->head, start); + + return next; +} + +/* + * Lock a seg_tree for reading. This should only be used for calling + * seg_tree_iter(). All the other seg_tree functions provide their + * own locking. + */ +void +seg_tree_rdlock(struct seg_tree* seg_tree) +{ + int rc = pthread_rwlock_rdlock(&seg_tree->rwlock); + if (rc) { + LOGERR("pthread_rwlock_rdlock() failed - rc=%d", rc); + } +} + +/* + * Lock a seg_tree for read/write. This should only be used for calling + * seg_tree_iter(). All the other seg_tree functions provide their + * own locking. + */ +void +seg_tree_wrlock(struct seg_tree* seg_tree) +{ + int rc = pthread_rwlock_wrlock(&seg_tree->rwlock); + if (rc) { + LOGERR("pthread_rwlock_wrlock() failed - rc=%d", rc); + } +} + +/* + * Unlock a seg_tree for read/write. This should only be used for calling + * seg_tree_iter(). All the other seg_tree functions provide their + * own locking. + */ +void +seg_tree_unlock(struct seg_tree* seg_tree) +{ + int rc = pthread_rwlock_unlock(&seg_tree->rwlock); + if (rc) { + LOGERR("pthread_rwlock_unlock() failed - rc=%d", rc); + } +} + +/* + * Remove all nodes in seg_tree, but keep it initialized so you can + * seg_tree_add() to it. 
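+ * For example, a tree holding extents [0, 9] and [20, 29] reports
+ * seg_tree_count() == 2; after seg_tree_clear() the count and max
+ * offset are reset to 0 and new extents can be added again.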
+ */
+void seg_tree_clear(struct seg_tree* seg_tree)
+{
+ struct seg_tree_node* node = NULL;
+ struct seg_tree_node* oldnode = NULL;
+
+ seg_tree_wrlock(seg_tree);
+
+ if (RB_EMPTY(&seg_tree->head)) {
+ /* seg_tree is empty, nothing to do */
+ seg_tree_unlock(seg_tree);
+ return;
+ }
+
+ /* Remove and free each node in the tree */
+ while ((node = seg_tree_iter(seg_tree, node))) {
+ if (oldnode) {
+ RB_REMOVE(inttree, &seg_tree->head, oldnode);
+ free(oldnode);
+ }
+ oldnode = node;
+ }
+ if (oldnode) {
+ RB_REMOVE(inttree, &seg_tree->head, oldnode);
+ free(oldnode);
+ }
+
+ seg_tree->count = 0;
+ seg_tree->max = 0;
+ seg_tree_unlock(seg_tree);
+}
+
+/* Return the number of segments in the segment tree */
+unsigned long seg_tree_count(struct seg_tree* seg_tree)
+{
+ seg_tree_rdlock(seg_tree);
+ unsigned long count = seg_tree->count;
+ seg_tree_unlock(seg_tree);
+ return count;
+}
+
+/* Return the maximum ending logical offset in the tree */
+unsigned long seg_tree_max(struct seg_tree* seg_tree)
+{
+ seg_tree_rdlock(seg_tree);
+ unsigned long max = seg_tree->max;
+ seg_tree_unlock(seg_tree);
+ return max;
+}
diff --git a/common/src/seg_tree.h b/common/src/seg_tree.h
new file mode 100644
index 000000000..a29416036
--- /dev/null
+++ b/common/src/seg_tree.h
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2020, Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ *
+ * Copyright 2020, UT-Battelle, LLC.
+ *
+ * LLNL-CODE-741539
+ * All rights reserved.
+ *
+ * This is the license for UnifyFS.
+ * For details, see https://github.com/LLNL/UnifyFS.
+ * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text.
+ */
+
+#ifndef __SEG_TREE_H__
+#define __SEG_TREE_H__
+
+#include <pthread.h>
+#include "tree.h"
+
+struct seg_tree_node {
+ RB_ENTRY(seg_tree_node) entry;
+ unsigned long start; /* starting logical offset of range */
+ unsigned long end; /* ending logical offset of range */
+ unsigned long ptr; /* physical offset of data in log */
+};
+
+struct seg_tree {
+ RB_HEAD(inttree, seg_tree_node) head;
+ pthread_rwlock_t rwlock;
+ unsigned long count; /* number of segments stored in tree */
+ unsigned long max; /* maximum logical offset value in the tree */
+};
+
+/* Returns 0 on success, positive non-zero error code otherwise */
+int seg_tree_init(struct seg_tree* seg_tree);
+
+/*
+ * Remove all nodes in seg_tree, but keep it initialized so you can
+ * seg_tree_add() to it.
+ */
+void seg_tree_clear(struct seg_tree* seg_tree);
+
+/*
+ * Remove and free all nodes in the seg_tree.
+ */
+void seg_tree_destroy(struct seg_tree* seg_tree);
+
+/*
+ * Add an entry to the range tree. Returns 0 on success, nonzero otherwise.
+ */
+int seg_tree_add(struct seg_tree* seg_tree, unsigned long start,
+ unsigned long end, unsigned long ptr);
+
+/*
+ * Remove or truncate one or more entries from the range tree
+ * if they overlap [start, end].
+ *
+ * Returns 0 on success, nonzero otherwise.
+ */
+int seg_tree_remove(
+ struct seg_tree* seg_tree,
+ unsigned long start,
+ unsigned long end
+);
+
+/*
+ * Find the first seg_tree_node that falls in a [start, end] range.
+ */
+struct seg_tree_node* seg_tree_find(
+ struct seg_tree* seg_tree,
+ unsigned long start,
+ unsigned long end
+);
+
+/*
+ * Find the first seg_tree_node that falls in a [start, end] range.
+ * Assumes you've already locked the tree.
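+ *
+ * A minimal usage sketch (illustrative only, offsets are made up):
+ *
+ * seg_tree_rdlock(&seg_tree);
+ * struct seg_tree_node* n = seg_tree_find_nolock(&seg_tree, 100, 199);
+ * if (n != NULL) {
+ * printf("[%lu-%lu]\n", n->start, n->end);
+ * }
+ * seg_tree_unlock(&seg_tree);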
+ */ +struct seg_tree_node* seg_tree_find_nolock( + struct seg_tree* seg_tree, + unsigned long start, + unsigned long end +); + +/* + * Given a range tree and a starting node, iterate though all the nodes + * in the tree, returning the next one each time. If start is NULL, then + * start with the first node in the tree. + * + * This is meant to be called in a loop, like: + * + * seg_tree_rdlock(seg_tree); + * + * struct seg_tree_node *node = NULL; + * while ((node = seg_tree_iter(seg_tree, node))) { + * printf("[%d-%d]", node->start, node->end); + * } + * + * seg_tree_unlock(seg_tree); + * + * Note: this function does no locking, and assumes you're properly locking + * and unlocking the seg_tree before doing the iteration (see + * seg_tree_rdlock()/seg_tree_wrlock()/seg_tree_unlock()). + */ +struct seg_tree_node* seg_tree_iter(struct seg_tree* seg_tree, + struct seg_tree_node* start); + +/* Return the number of segments in the segment tree */ +unsigned long seg_tree_count(struct seg_tree* seg_tree); + +/* Return the maximum ending logical offset in the tree */ +unsigned long seg_tree_max(struct seg_tree* seg_tree); + +/* + * Locking functions for use with seg_tree_iter(). They allow you to lock the + * tree to iterate over it: + * + * seg_tree_rdlock(&seg_tree); + * + * struct seg_tree_node *node = NULL; + * while ((node = seg_tree_iter(seg_tree, node))) { + * printf("[%d-%d]", node->start, node->end); + * } + * + * seg_tree_unlock(&seg_tree); + */ + +/* + * Lock a seg_tree for reading. This should only be used for calling + * seg_tree_iter(). All the other seg_tree functions provide their + * own locking. + */ +void seg_tree_rdlock(struct seg_tree* seg_tree); + +/* + * Lock a seg_tree for read/write. This should only be used for calling + * seg_tree_iter(). All the other seg_tree functions provide their + * own locking. + */ +void seg_tree_wrlock(struct seg_tree* seg_tree); + +/* + * Unlock a seg_tree for read/write. This should only be used for calling + * seg_tree_iter(). All the other seg_tree functions provide their + * own locking. + */ +void seg_tree_unlock(struct seg_tree* seg_tree); + +#endif diff --git a/common/src/slotmap.c b/common/src/slotmap.c new file mode 100644 index 000000000..408a36b3b --- /dev/null +++ b/common/src/slotmap.c @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
+ */ + +#include "unifyfs_const.h" +#include "slotmap.h" + +#include +#include // bool +#include // uint8_t +#include +#include // NULL +#include // memset() + + +/* Bit-twiddling convenience macros */ +#define SLOT_BYTE(slot) ((slot) >> 3) +#define SLOT_BIT(slot) ((slot) & 0x7) +#define BYTE_BIT_TO_SLOT(byte, bit) (((byte) * 8) + (bit)) +#define BYTE_VAL_BIT(byte_val, bit) ((byte_val) & (uint8_t)(1 << (bit))) + +/* Set given byte-bit in use map */ +static inline +void set_usemap_byte_bit(uint8_t* usemap, size_t byte, int bit) +{ + uint8_t byte_val = usemap[byte]; + byte_val |= (uint8_t)(1 << bit); + usemap[byte] = byte_val; +} + +/* Clear given byte-bit in use map */ +static inline +void clear_usemap_byte_bit(uint8_t* usemap, size_t byte, int bit) +{ + uint8_t byte_val = usemap[byte]; + uint8_t byte_mask = (uint8_t)0xFF - (uint8_t)(1 << bit); + byte_val &= byte_mask; + usemap[byte] = byte_val; +} + +/* Check use map for slot used */ +static inline +int check_slot(uint8_t* usemap, size_t slot) +{ + size_t byte = SLOT_BYTE(slot); + int bit = SLOT_BIT(slot); + uint8_t byte_val = usemap[byte]; + if (BYTE_VAL_BIT(byte_val, bit)) { + return 1; + } + return 0; +} + +/* Update use map to use slot */ +static inline +void use_slot(uint8_t* usemap, size_t slot) +{ + size_t byte = SLOT_BYTE(slot); + int bit = SLOT_BIT(slot); + set_usemap_byte_bit(usemap, byte, bit); +} + +/* Update use map to release slot */ +static inline +void release_slot(uint8_t* usemap, size_t slot) +{ + size_t byte = SLOT_BYTE(slot); + int bit = SLOT_BIT(slot); + clear_usemap_byte_bit(usemap, byte, bit); +} + +/* Return bytes necessary to hold use map for given number of slots */ +static inline +size_t slot_map_bytes(size_t total_slots) +{ + size_t map_bytes = SLOT_BYTE(total_slots); + if (SLOT_BIT(total_slots)) { + map_bytes++; + } + return map_bytes; +} + +/* Slot usage bitmap immediately follows the structure in memory. + * The usage bitmap can be thought of as an uint8_t array, where + * each uint8_t represents 8 slots. + * uint8_t use_bitmap[total_slots/8] + */ +static inline +uint8_t* get_use_map(slot_map* smap) +{ + uint8_t* usemap = (uint8_t*)((char*)smap + sizeof(slot_map)); + return usemap; +} + +/* Return number of free slots */ +static inline +size_t get_free_slots(slot_map* smap) +{ + return smap->total_slots - smap->used_slots; +} + +/** + * Initialize a slot map within the given memory region, and return a pointer + * to the slot_map structure. Returns NULL if the provided memory region is not + * large enough to track the requested number of slots. + * + * @param num_slots number of slots to track + * @param region_addr address of the memory region + * @param region_sz size of the memory region + * + * @return valid slot_map pointer, or NULL on error + */ +slot_map* slotmap_init(size_t num_slots, + void* region_addr, + size_t region_sz) +{ + if (NULL == region_addr) { + return NULL; + } + + size_t avail_use_bytes = region_sz - sizeof(slot_map); + size_t needed_use_bytes = slot_map_bytes(num_slots); + if (needed_use_bytes > avail_use_bytes) { + /* not enough space for use map */ + return NULL; + } + + slot_map* smap = (slot_map*) region_addr; + smap->total_slots = num_slots; + slotmap_clear(smap); + + return smap; +} + +/** + * Clear the given slot_map. Marks all slots free. 
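/*
 * Usage sketch: carving a slot map out of an ordinary heap buffer. The
 * caller sizes the region as the slot_map header plus one bit per slot
 * (rounded up to whole bytes), matching the layout described above. In
 * practice the region could just as well be a piece of shared memory; the
 * names below are illustrative.
 */
#include <stdlib.h>
#include "slotmap.h"

slot_map* example_create_slotmap(size_t num_slots)
{
    /* header + one byte of use-bitmap per 8 slots */
    size_t region_sz = sizeof(slot_map) + ((num_slots + 7) / 8);

    void* region = malloc(region_sz);
    if (NULL == region) {
        return NULL;
    }

    slot_map* smap = slotmap_init(num_slots, region, region_sz);
    if (NULL == smap) {
        /* region too small or invalid argument; caller keeps ownership */
        free(region);
        return NULL;
    }
    return smap;
}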
+ * + * @param smap valid slot_map pointer + * + * @return UNIFYFS_SUCCESS, or error code + */ +int slotmap_clear(slot_map* smap) +{ + if (NULL == smap) { + return EINVAL; + } + + /* set used to zero */ + smap->used_slots = 0; + + /* zero-out use map */ + uint8_t* usemap = get_use_map(smap); + memset((void*)usemap, 0, slot_map_bytes(smap->total_slots)); + + return UNIFYFS_SUCCESS; +} + +static inline +size_t find_consecutive_zero_bits(uint8_t byte_val, + size_t num_bits, + int* start_bit) +{ + size_t max_free = 0; + int free_start = 0; + for (int bit = 0; bit < 8; bit++) { + /* starting at current bit, count consecutive zero bits */ + int run_start = bit; + size_t run_max = 0; + while (!BYTE_VAL_BIT(byte_val, bit)) { + run_max++; + bit++; + if (bit == 8) { + break; + } + } + if (run_max > max_free) { + max_free = run_max; + free_start = run_start; + } + } + if (NULL != start_bit) { + if (max_free >= num_bits) { + *start_bit = free_start; + } else { + *start_bit = -1; /* did not find enough bits */ + } + } + return max_free; +} + +/** + * Reserve consecutive slots in the slot_map. + * + * @param smap valid slot_map pointer + * @param num_slots number of slots to reserve + * + * @return starting slot index of reservation, or -1 on error + */ +ssize_t slotmap_reserve(slot_map* smap, + size_t num_slots) +{ + if ((NULL == smap) || (0 == num_slots)) { + return (ssize_t)-1; + } + + size_t free = get_free_slots(smap); + if (free < num_slots) { + /* not enough free slots available */ + return (ssize_t)-1; + } + + /* need this many usemap bytes for requested slots */ + size_t slot_bytes = slot_map_bytes(num_slots); + + /* these will be set if we find a spot for the reservation */ + size_t start_slot; + int found_start = 0; + + /* search for contiguous free slots */ + size_t search_start = 0; + if (slot_bytes > 1) { + /* skip past (likely) used slots */ + search_start = SLOT_BYTE(smap->used_slots); + } + uint8_t* usemap = get_use_map(smap); + size_t map_bytes = slot_map_bytes(smap->total_slots); + for (size_t byte_ndx = search_start; byte_ndx < map_bytes; byte_ndx++) { + uint8_t byte_val = usemap[byte_ndx]; + if (byte_val == UINT8_MAX) { + /* current byte is completely occupied */ + continue; + } else if ((slot_bytes > 1) && + ((byte_ndx + slot_bytes) <= map_bytes)) { + /* look for slot_bytes consecutive zero bytes */ + size_t run_start = byte_ndx; + size_t run_count = 0; + while (0 == usemap[byte_ndx]) { + run_count++; + byte_ndx++; + if (run_count == slot_bytes) { + /* success */ + start_slot = BYTE_BIT_TO_SLOT(run_start, 0); + found_start = 1; + break; + } + } + } else { + /* need at most 8 bits, spanning at most two bytes */ + int bit_in_byte; + size_t free_bits = find_consecutive_zero_bits(byte_val, num_slots, + &bit_in_byte); + if (free_bits >= num_slots) { + /* success, can reserve all slots in this byte of use map */ + assert(bit_in_byte != -1); + start_slot = BYTE_BIT_TO_SLOT(byte_ndx, bit_in_byte); + found_start = 1; + } else if ((byte_ndx + 1) < map_bytes) { + /* check if free bits are at the end of the byte */ + find_consecutive_zero_bits(byte_val, free_bits, &bit_in_byte); + assert(bit_in_byte != -1); + if (bit_in_byte == (8 - free_bits)) { + /* free bits are at tail end of byte, + * check next byte for remaining needed slots */ + int bit_in_byte2; + size_t have_bits = free_bits; + size_t need_bits = num_slots - have_bits; + byte_val = usemap[byte_ndx + 1]; + free_bits = find_consecutive_zero_bits(byte_val, need_bits, + &bit_in_byte2); + if ((free_bits >= need_bits) && (bit_in_byte2 == 
0)) { + /* success, has enough free bits at start of byte */ + start_slot = BYTE_BIT_TO_SLOT(byte_ndx, bit_in_byte); + found_start = 1; + } + } + } + } + if (found_start) { + break; + } + } + + if (found_start) { + /* success, reserve bits in consecutive slots */ + for (size_t i = 0; i < num_slots; i++) { + use_slot(usemap, start_slot + i); + } + smap->used_slots += num_slots; + return (ssize_t)start_slot; + } + + /* did not find enough consecutive free slots */ + return (ssize_t)-1; +} + +/** + * Release consecutive slots in the slot_map. + * + * @param smap valid slot_map pointer + * @param start_index starting slot index + * @param num_slots number of slots to release + * + * @return UNIFYFS_SUCCESS, or error code + */ +int slotmap_release(slot_map* smap, + size_t start_index, + size_t num_slots) +{ + if (NULL == smap) { + return EINVAL; + } + + uint8_t* usemap = get_use_map(smap); + + /* make sure first bit at start slot index is actually set */ + if (!check_slot(usemap, start_index)) { + return EINVAL; + } + + /* release the slots */ + for (size_t i = 0; i < num_slots; i++) { + release_slot(usemap, start_index + i); + } + smap->used_slots -= num_slots; + + return UNIFYFS_SUCCESS; +} + +/** + * Print the slot_map to stderr (for debugging). + * + * @param smap valid slot_map pointer + */ +void slotmap_print(slot_map* smap) +{ + if (NULL == smap) { + return; + } + + uint8_t* usemap = get_use_map(smap); + + /* the '#' at the beginning of the lines is for compatibility with TAP */ + fprintf(stderr, "# Slot Map:\n"); + fprintf(stderr, "# total slots - %zu\n", smap->total_slots); + fprintf(stderr, "# used slots - %zu\n", smap->used_slots); + + for (size_t i = 0; i < smap->total_slots; i++) { + if (i % 64 == 0) { + fprintf(stderr, "\n# %8zu : ", i); + } else if (i % 8 == 0) { + fprintf(stderr, " "); + } + int bitval = check_slot(usemap, i); + fprintf(stderr, "%d", bitval); + } + fprintf(stderr, "\n#\n"); +} + diff --git a/common/src/slotmap.h b/common/src/slotmap.h new file mode 100644 index 000000000..ea937ff91 --- /dev/null +++ b/common/src/slotmap.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#ifndef SLOTMAP_H +#define SLOTMAP_H + +#include // size_t, ssize_t + +#ifdef __cplusplus +extern "C" { +#endif + +/* slot map, a simple structure that manages a bitmap of used/free slots */ +typedef struct slot_map { + size_t total_slots; + size_t used_slots; +} slot_map; + +/* The slot usage bitmap immediately follows the structure in memory. + * The usage bitmap can be thought of as an uint8_t array, where + * each uint8_t represents 8 slots. + * uint8_t use_bitmap[total_slots/8] + */ + +/** + * Initialize a slot map within the given memory region, and return a pointer + * to the slot_map structure. Returns NULL if the provided memory region is not + * large enough to track the requested number of slots. 
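/*
 * Usage sketch: reserving a run of contiguous slots and releasing it again.
 * The smap pointer is assumed to come from slotmap_init(); the slot count of
 * 4 is arbitrary.
 */
#include <stdio.h>
#include "slotmap.h"

int example_reserve_release(slot_map* smap)
{
    ssize_t first = slotmap_reserve(smap, 4);
    if (first < 0) {
        /* no run of 4 contiguous free slots was available */
        return -1;
    }
    printf("reserved slots [%zd, %zd]\n", first, first + 3);

    /* ... use 'first' to index whatever storage the slots describe ... */

    /* give the slots back; returns UNIFYFS_SUCCESS or an error code */
    return slotmap_release(smap, (size_t)first, 4);
}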
+ * + * @param num_slots number of slots to track + * @param region_addr address of the memory region + * @param region_sz size of the memory region + * + * @return valid slot_map pointer, or NULL on error + */ +slot_map* slotmap_init(size_t num_slots, + void* region_addr, + size_t region_sz); + +/** + * Clear the given slot_map. Marks all slots free. + * + * @param smap valid slot_map pointer + * + * @return UNIFYFS_SUCCESS, or error code + */ +int slotmap_clear(slot_map* smap); + +/** + * Reserve consecutive slots in the slot_map. + * + * @param smap valid slot_map pointer + * @param num_slots number of slots to reserve + * + * @return starting slot index of reservation, or -1 on error + */ +ssize_t slotmap_reserve(slot_map* smap, + size_t num_slots); + +/** + * Release consecutive slots in the slot_map. + * + * @param smap valid slot_map pointer + * @param start_index starting slot index + * @param num_slots number of slots to release + * + * @return UNIFYFS_SUCCESS, or error code + */ +int slotmap_release(slot_map* smap, + size_t start_index, + size_t num_slots); + +/** + * Print the slot_map (for debugging). + * + * @param smap valid slot_map pointer + */ +void slotmap_print(slot_map* smap); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // UNIFYFS_BITMAP_H + diff --git a/common/src/tree.h b/common/src/tree.h new file mode 100644 index 000000000..fb190e7e2 --- /dev/null +++ b/common/src/tree.h @@ -0,0 +1,1007 @@ +/* $OpenBSD: tree.h,v 1.29 2017/07/30 19:27:20 deraadt Exp $ */ +/* + * Copyright 2002 Niels Provos + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _SYS_TREE_H_ +#define _SYS_TREE_H_ + +/* OpenBSD only #include - commenting it out */ +/* #include */ + +/* + * This file defines data structures for different types of trees: + * splay trees and red-black trees. + * + * A splay tree is a self-organizing data structure. Every operation + * on the tree causes a splay to happen. The splay moves the requested + * node to the root of the tree and partly rebalances it. + * + * This has the benefit that request locality causes faster lookups as + * the requested nodes move to the top of the tree. On the other hand, + * every lookup causes memory writes. 
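/*
 * Usage sketch for the red-black tree macros this header provides: embed an
 * RB_ENTRY in the node type, generate the tree functions from a comparator,
 * then insert, look up, and iterate. The kv_* names are illustrative only.
 */
#include <stdlib.h>
#include "tree.h"

struct kv_node {
    RB_ENTRY(kv_node) entry;   /* left/right/parent/color links */
    int key;
};

static int kv_cmp(struct kv_node* a, struct kv_node* b)
{
    return (a->key < b->key) ? -1 : (a->key > b->key);
}

RB_HEAD(kv_tree, kv_node);
RB_PROTOTYPE(kv_tree, kv_node, entry, kv_cmp)
RB_GENERATE(kv_tree, kv_node, entry, kv_cmp)

static struct kv_tree tree_root = RB_INITIALIZER(&tree_root);

void example_rb_usage(void)
{
    struct kv_node* n = calloc(1, sizeof(*n));
    if (NULL == n) {
        return;
    }
    n->key = 42;
    RB_INSERT(kv_tree, &tree_root, n);   /* returns NULL when inserted */

    struct kv_node query = { .key = 42 };
    struct kv_node* found = RB_FIND(kv_tree, &tree_root, &query);
    (void)found;

    struct kv_node* iter;
    RB_FOREACH(iter, kv_tree, &tree_root) {
        /* nodes are visited in ascending key order */
    }
}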
+ * + * The Balance Theorem bounds the total access time for m operations + * and n inserts on an initially empty tree as O((m + n)lg n). The + * amortized cost for a sequence of m accesses to a splay tree is O(lg n); + * + * A red-black tree is a binary search tree with the node color as an + * extra attribute. It fulfills a set of conditions: + * - every search path from the root to a leaf consists of the + * same number of black nodes, + * - each red node (except for the root) has a black parent, + * - each leaf node is black. + * + * Every operation on a red-black tree is bounded as O(lg n). + * The maximum height of a red-black tree is 2lg (n+1). + */ + +#define SPLAY_HEAD(name, type) \ +struct name { \ + struct type *sph_root; /* root of the tree */ \ +} + +#define SPLAY_INITIALIZER(root) \ + { NULL } + +#define SPLAY_INIT(root) do { \ + (root)->sph_root = NULL; \ +} while (0) + +#define SPLAY_ENTRY(type) \ +struct { \ + struct type *spe_left; /* left element */ \ + struct type *spe_right; /* right element */ \ +} + +#define SPLAY_LEFT(elm, field) (elm)->field.spe_left +#define SPLAY_RIGHT(elm, field) (elm)->field.spe_right +#define SPLAY_ROOT(head) (head)->sph_root +#define SPLAY_EMPTY(head) (SPLAY_ROOT(head) == NULL) + +/* SPLAY_ROTATE_{LEFT,RIGHT} expect that tmp hold SPLAY_{RIGHT,LEFT} */ +#define SPLAY_ROTATE_RIGHT(head, tmp, field) do { \ + SPLAY_LEFT((head)->sph_root, field) = SPLAY_RIGHT(tmp, field); \ + SPLAY_RIGHT(tmp, field) = (head)->sph_root; \ + (head)->sph_root = tmp; \ +} while (0) + +#define SPLAY_ROTATE_LEFT(head, tmp, field) do { \ + SPLAY_RIGHT((head)->sph_root, field) = SPLAY_LEFT(tmp, field); \ + SPLAY_LEFT(tmp, field) = (head)->sph_root; \ + (head)->sph_root = tmp; \ +} while (0) + +#define SPLAY_LINKLEFT(head, tmp, field) do { \ + SPLAY_LEFT(tmp, field) = (head)->sph_root; \ + tmp = (head)->sph_root; \ + (head)->sph_root = SPLAY_LEFT((head)->sph_root, field); \ +} while (0) + +#define SPLAY_LINKRIGHT(head, tmp, field) do { \ + SPLAY_RIGHT(tmp, field) = (head)->sph_root; \ + tmp = (head)->sph_root; \ + (head)->sph_root = SPLAY_RIGHT((head)->sph_root, field); \ +} while (0) + +#define SPLAY_ASSEMBLE(head, node, left, right, field) do { \ + SPLAY_RIGHT(left, field) = SPLAY_LEFT((head)->sph_root, field); \ + SPLAY_LEFT(right, field) = SPLAY_RIGHT((head)->sph_root, field);\ + SPLAY_LEFT((head)->sph_root, field) = SPLAY_RIGHT(node, field); \ + SPLAY_RIGHT((head)->sph_root, field) = SPLAY_LEFT(node, field); \ +} while (0) + +/* Generates prototypes and inline functions */ + +#define SPLAY_PROTOTYPE(name, type, field, cmp) \ +void name##_SPLAY(struct name *, struct type *); \ +void name##_SPLAY_MINMAX(struct name *, int); \ +struct type *name##_SPLAY_INSERT(struct name *, struct type *); \ +struct type *name##_SPLAY_REMOVE(struct name *, struct type *); \ + \ +/* Finds the node with the same key as elm */ \ +static __unused __inline struct type * \ +name##_SPLAY_FIND(struct name *head, struct type *elm) \ +{ \ + if (SPLAY_EMPTY(head)) \ + return(NULL); \ + name##_SPLAY(head, elm); \ + if ((cmp)(elm, (head)->sph_root) == 0) \ + return (head->sph_root); \ + return (NULL); \ +} \ + \ +static __unused __inline struct type * \ +name##_SPLAY_NEXT(struct name *head, struct type *elm) \ +{ \ + name##_SPLAY(head, elm); \ + if (SPLAY_RIGHT(elm, field) != NULL) { \ + elm = SPLAY_RIGHT(elm, field); \ + while (SPLAY_LEFT(elm, field) != NULL) { \ + elm = SPLAY_LEFT(elm, field); \ + } \ + } else \ + elm = NULL; \ + return (elm); \ +} \ + \ +static __unused __inline struct type * \ 
+name##_SPLAY_MIN_MAX(struct name *head, int val) \ +{ \ + name##_SPLAY_MINMAX(head, val); \ + return (SPLAY_ROOT(head)); \ +} + +/* Main splay operation. + * Moves node close to the key of elm to top + */ +#define SPLAY_GENERATE(name, type, field, cmp) \ +struct type * \ +name##_SPLAY_INSERT(struct name *head, struct type *elm) \ +{ \ + if (SPLAY_EMPTY(head)) { \ + SPLAY_LEFT(elm, field) = SPLAY_RIGHT(elm, field) = NULL; \ + } else { \ + int __comp; \ + name##_SPLAY(head, elm); \ + __comp = (cmp)(elm, (head)->sph_root); \ + if(__comp < 0) { \ + SPLAY_LEFT(elm, field) = SPLAY_LEFT((head)->sph_root, field);\ + SPLAY_RIGHT(elm, field) = (head)->sph_root; \ + SPLAY_LEFT((head)->sph_root, field) = NULL; \ + } else if (__comp > 0) { \ + SPLAY_RIGHT(elm, field) = SPLAY_RIGHT((head)->sph_root, field);\ + SPLAY_LEFT(elm, field) = (head)->sph_root; \ + SPLAY_RIGHT((head)->sph_root, field) = NULL; \ + } else \ + return ((head)->sph_root); \ + } \ + (head)->sph_root = (elm); \ + return (NULL); \ +} \ + \ +struct type * \ +name##_SPLAY_REMOVE(struct name *head, struct type *elm) \ +{ \ + struct type *__tmp; \ + if (SPLAY_EMPTY(head)) \ + return (NULL); \ + name##_SPLAY(head, elm); \ + if ((cmp)(elm, (head)->sph_root) == 0) { \ + if (SPLAY_LEFT((head)->sph_root, field) == NULL) { \ + (head)->sph_root = SPLAY_RIGHT((head)->sph_root, field);\ + } else { \ + __tmp = SPLAY_RIGHT((head)->sph_root, field); \ + (head)->sph_root = SPLAY_LEFT((head)->sph_root, field);\ + name##_SPLAY(head, elm); \ + SPLAY_RIGHT((head)->sph_root, field) = __tmp; \ + } \ + return (elm); \ + } \ + return (NULL); \ +} \ + \ +void \ +name##_SPLAY(struct name *head, struct type *elm) \ +{ \ + struct type __node, *__left, *__right, *__tmp; \ + int __comp; \ +\ + SPLAY_LEFT(&__node, field) = SPLAY_RIGHT(&__node, field) = NULL;\ + __left = __right = &__node; \ +\ + while ((__comp = (cmp)(elm, (head)->sph_root))) { \ + if (__comp < 0) { \ + __tmp = SPLAY_LEFT((head)->sph_root, field); \ + if (__tmp == NULL) \ + break; \ + if ((cmp)(elm, __tmp) < 0){ \ + SPLAY_ROTATE_RIGHT(head, __tmp, field); \ + if (SPLAY_LEFT((head)->sph_root, field) == NULL)\ + break; \ + } \ + SPLAY_LINKLEFT(head, __right, field); \ + } else if (__comp > 0) { \ + __tmp = SPLAY_RIGHT((head)->sph_root, field); \ + if (__tmp == NULL) \ + break; \ + if ((cmp)(elm, __tmp) > 0){ \ + SPLAY_ROTATE_LEFT(head, __tmp, field); \ + if (SPLAY_RIGHT((head)->sph_root, field) == NULL)\ + break; \ + } \ + SPLAY_LINKRIGHT(head, __left, field); \ + } \ + } \ + SPLAY_ASSEMBLE(head, &__node, __left, __right, field); \ +} \ + \ +/* Splay with either the minimum or the maximum element \ + * Used to find minimum or maximum element in tree. 
\ + */ \ +void name##_SPLAY_MINMAX(struct name *head, int __comp) \ +{ \ + struct type __node, *__left, *__right, *__tmp; \ +\ + SPLAY_LEFT(&__node, field) = SPLAY_RIGHT(&__node, field) = NULL;\ + __left = __right = &__node; \ +\ + while (1) { \ + if (__comp < 0) { \ + __tmp = SPLAY_LEFT((head)->sph_root, field); \ + if (__tmp == NULL) \ + break; \ + if (__comp < 0){ \ + SPLAY_ROTATE_RIGHT(head, __tmp, field); \ + if (SPLAY_LEFT((head)->sph_root, field) == NULL)\ + break; \ + } \ + SPLAY_LINKLEFT(head, __right, field); \ + } else if (__comp > 0) { \ + __tmp = SPLAY_RIGHT((head)->sph_root, field); \ + if (__tmp == NULL) \ + break; \ + if (__comp > 0) { \ + SPLAY_ROTATE_LEFT(head, __tmp, field); \ + if (SPLAY_RIGHT((head)->sph_root, field) == NULL)\ + break; \ + } \ + SPLAY_LINKRIGHT(head, __left, field); \ + } \ + } \ + SPLAY_ASSEMBLE(head, &__node, __left, __right, field); \ +} + +#define SPLAY_NEGINF -1 +#define SPLAY_INF 1 + +#define SPLAY_INSERT(name, x, y) name##_SPLAY_INSERT(x, y) +#define SPLAY_REMOVE(name, x, y) name##_SPLAY_REMOVE(x, y) +#define SPLAY_FIND(name, x, y) name##_SPLAY_FIND(x, y) +#define SPLAY_NEXT(name, x, y) name##_SPLAY_NEXT(x, y) +#define SPLAY_MIN(name, x) (SPLAY_EMPTY(x) ? NULL \ + : name##_SPLAY_MIN_MAX(x, SPLAY_NEGINF)) +#define SPLAY_MAX(name, x) (SPLAY_EMPTY(x) ? NULL \ + : name##_SPLAY_MIN_MAX(x, SPLAY_INF)) + +#define SPLAY_FOREACH(x, name, head) \ + for ((x) = SPLAY_MIN(name, head); \ + (x) != NULL; \ + (x) = SPLAY_NEXT(name, head, x)) + +/* Macros that define a red-black tree */ +#define RB_HEAD(name, type) \ +struct name { \ + struct type *rbh_root; /* root of the tree */ \ +} + +#define RB_INITIALIZER(root) \ + { NULL } + +#define RB_INIT(root) do { \ + (root)->rbh_root = NULL; \ +} while (0) + +#define RB_BLACK 0 +#define RB_RED 1 +#define RB_ENTRY(type) \ +struct { \ + struct type *rbe_left; /* left element */ \ + struct type *rbe_right; /* right element */ \ + struct type *rbe_parent; /* parent element */ \ + int rbe_color; /* node color */ \ +} + +#define RB_LEFT(elm, field) (elm)->field.rbe_left +#define RB_RIGHT(elm, field) (elm)->field.rbe_right +#define RB_PARENT(elm, field) (elm)->field.rbe_parent +#define RB_COLOR(elm, field) (elm)->field.rbe_color +#define RB_ROOT(head) (head)->rbh_root +#define RB_EMPTY(head) (RB_ROOT(head) == NULL) + +#define RB_SET(elm, parent, field) do { \ + RB_PARENT(elm, field) = parent; \ + RB_LEFT(elm, field) = RB_RIGHT(elm, field) = NULL; \ + RB_COLOR(elm, field) = RB_RED; \ +} while (0) + +#define RB_SET_BLACKRED(black, red, field) do { \ + RB_COLOR(black, field) = RB_BLACK; \ + RB_COLOR(red, field) = RB_RED; \ +} while (0) + +#ifndef RB_AUGMENT +#define RB_AUGMENT(x) do {} while (0) +#endif + +#define RB_ROTATE_LEFT(head, elm, tmp, field) do { \ + (tmp) = RB_RIGHT(elm, field); \ + if ((RB_RIGHT(elm, field) = RB_LEFT(tmp, field))) { \ + RB_PARENT(RB_LEFT(tmp, field), field) = (elm); \ + } \ + RB_AUGMENT(elm); \ + if ((RB_PARENT(tmp, field) = RB_PARENT(elm, field))) { \ + if ((elm) == RB_LEFT(RB_PARENT(elm, field), field)) \ + RB_LEFT(RB_PARENT(elm, field), field) = (tmp); \ + else \ + RB_RIGHT(RB_PARENT(elm, field), field) = (tmp); \ + } else \ + (head)->rbh_root = (tmp); \ + RB_LEFT(tmp, field) = (elm); \ + RB_PARENT(elm, field) = (tmp); \ + RB_AUGMENT(tmp); \ + if ((RB_PARENT(tmp, field))) \ + RB_AUGMENT(RB_PARENT(tmp, field)); \ +} while (0) + +#define RB_ROTATE_RIGHT(head, elm, tmp, field) do { \ + (tmp) = RB_LEFT(elm, field); \ + if ((RB_LEFT(elm, field) = RB_RIGHT(tmp, field))) { \ + RB_PARENT(RB_RIGHT(tmp, 
field), field) = (elm); \ + } \ + RB_AUGMENT(elm); \ + if ((RB_PARENT(tmp, field) = RB_PARENT(elm, field))) { \ + if ((elm) == RB_LEFT(RB_PARENT(elm, field), field)) \ + RB_LEFT(RB_PARENT(elm, field), field) = (tmp); \ + else \ + RB_RIGHT(RB_PARENT(elm, field), field) = (tmp); \ + } else \ + (head)->rbh_root = (tmp); \ + RB_RIGHT(tmp, field) = (elm); \ + RB_PARENT(elm, field) = (tmp); \ + RB_AUGMENT(tmp); \ + if ((RB_PARENT(tmp, field))) \ + RB_AUGMENT(RB_PARENT(tmp, field)); \ +} while (0) + +/* Generates prototypes and inline functions */ +#define RB_PROTOTYPE(name, type, field, cmp) \ + RB_PROTOTYPE_INTERNAL(name, type, field, cmp,) +#define RB_PROTOTYPE_STATIC(name, type, field, cmp) \ + RB_PROTOTYPE_INTERNAL(name, type, field, cmp, __attribute__((__unused__)) static) +#define RB_PROTOTYPE_INTERNAL(name, type, field, cmp, attr) \ +attr void name##_RB_INSERT_COLOR(struct name *, struct type *); \ +attr void name##_RB_REMOVE_COLOR(struct name *, struct type *, struct type *);\ +attr struct type *name##_RB_REMOVE(struct name *, struct type *); \ +attr struct type *name##_RB_INSERT(struct name *, struct type *); \ +attr struct type *name##_RB_FIND(struct name *, struct type *); \ +attr struct type *name##_RB_NFIND(struct name *, struct type *); \ +attr struct type *name##_RB_NEXT(struct type *); \ +attr struct type *name##_RB_PREV(struct type *); \ +attr struct type *name##_RB_MINMAX(struct name *, int); \ + \ + +/* Main rb operation. + * Moves node close to the key of elm to top + */ +#define RB_GENERATE(name, type, field, cmp) \ + RB_GENERATE_INTERNAL(name, type, field, cmp,) +#define RB_GENERATE_STATIC(name, type, field, cmp) \ + RB_GENERATE_INTERNAL(name, type, field, cmp, __attribute__((__unused__)) static) +#define RB_GENERATE_INTERNAL(name, type, field, cmp, attr) \ +attr void \ +name##_RB_INSERT_COLOR(struct name *head, struct type *elm) \ +{ \ + struct type *parent, *gparent, *tmp; \ + while ((parent = RB_PARENT(elm, field)) && \ + RB_COLOR(parent, field) == RB_RED) { \ + gparent = RB_PARENT(parent, field); \ + if (parent == RB_LEFT(gparent, field)) { \ + tmp = RB_RIGHT(gparent, field); \ + if (tmp && RB_COLOR(tmp, field) == RB_RED) { \ + RB_COLOR(tmp, field) = RB_BLACK; \ + RB_SET_BLACKRED(parent, gparent, field);\ + elm = gparent; \ + continue; \ + } \ + if (RB_RIGHT(parent, field) == elm) { \ + RB_ROTATE_LEFT(head, parent, tmp, field);\ + tmp = parent; \ + parent = elm; \ + elm = tmp; \ + } \ + RB_SET_BLACKRED(parent, gparent, field); \ + RB_ROTATE_RIGHT(head, gparent, tmp, field); \ + } else { \ + tmp = RB_LEFT(gparent, field); \ + if (tmp && RB_COLOR(tmp, field) == RB_RED) { \ + RB_COLOR(tmp, field) = RB_BLACK; \ + RB_SET_BLACKRED(parent, gparent, field);\ + elm = gparent; \ + continue; \ + } \ + if (RB_LEFT(parent, field) == elm) { \ + RB_ROTATE_RIGHT(head, parent, tmp, field);\ + tmp = parent; \ + parent = elm; \ + elm = tmp; \ + } \ + RB_SET_BLACKRED(parent, gparent, field); \ + RB_ROTATE_LEFT(head, gparent, tmp, field); \ + } \ + } \ + RB_COLOR(head->rbh_root, field) = RB_BLACK; \ +} \ + \ +attr void \ +name##_RB_REMOVE_COLOR(struct name *head, struct type *parent, struct type *elm) \ +{ \ + struct type *tmp; \ + while ((elm == NULL || RB_COLOR(elm, field) == RB_BLACK) && \ + elm != RB_ROOT(head)) { \ + if (RB_LEFT(parent, field) == elm) { \ + tmp = RB_RIGHT(parent, field); \ + if (RB_COLOR(tmp, field) == RB_RED) { \ + RB_SET_BLACKRED(tmp, parent, field); \ + RB_ROTATE_LEFT(head, parent, tmp, field);\ + tmp = RB_RIGHT(parent, field); \ + } \ + if ((RB_LEFT(tmp, field) == 
NULL || \ + RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) &&\ + (RB_RIGHT(tmp, field) == NULL || \ + RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK)) {\ + RB_COLOR(tmp, field) = RB_RED; \ + elm = parent; \ + parent = RB_PARENT(elm, field); \ + } else { \ + if (RB_RIGHT(tmp, field) == NULL || \ + RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK) {\ + struct type *oleft; \ + if ((oleft = RB_LEFT(tmp, field)))\ + RB_COLOR(oleft, field) = RB_BLACK;\ + RB_COLOR(tmp, field) = RB_RED; \ + RB_ROTATE_RIGHT(head, tmp, oleft, field);\ + tmp = RB_RIGHT(parent, field); \ + } \ + RB_COLOR(tmp, field) = RB_COLOR(parent, field);\ + RB_COLOR(parent, field) = RB_BLACK; \ + if (RB_RIGHT(tmp, field)) \ + RB_COLOR(RB_RIGHT(tmp, field), field) = RB_BLACK;\ + RB_ROTATE_LEFT(head, parent, tmp, field);\ + elm = RB_ROOT(head); \ + break; \ + } \ + } else { \ + tmp = RB_LEFT(parent, field); \ + if (RB_COLOR(tmp, field) == RB_RED) { \ + RB_SET_BLACKRED(tmp, parent, field); \ + RB_ROTATE_RIGHT(head, parent, tmp, field);\ + tmp = RB_LEFT(parent, field); \ + } \ + if ((RB_LEFT(tmp, field) == NULL || \ + RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) &&\ + (RB_RIGHT(tmp, field) == NULL || \ + RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK)) {\ + RB_COLOR(tmp, field) = RB_RED; \ + elm = parent; \ + parent = RB_PARENT(elm, field); \ + } else { \ + if (RB_LEFT(tmp, field) == NULL || \ + RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) {\ + struct type *oright; \ + if ((oright = RB_RIGHT(tmp, field)))\ + RB_COLOR(oright, field) = RB_BLACK;\ + RB_COLOR(tmp, field) = RB_RED; \ + RB_ROTATE_LEFT(head, tmp, oright, field);\ + tmp = RB_LEFT(parent, field); \ + } \ + RB_COLOR(tmp, field) = RB_COLOR(parent, field);\ + RB_COLOR(parent, field) = RB_BLACK; \ + if (RB_LEFT(tmp, field)) \ + RB_COLOR(RB_LEFT(tmp, field), field) = RB_BLACK;\ + RB_ROTATE_RIGHT(head, parent, tmp, field);\ + elm = RB_ROOT(head); \ + break; \ + } \ + } \ + } \ + if (elm) \ + RB_COLOR(elm, field) = RB_BLACK; \ +} \ + \ +attr struct type * \ +name##_RB_REMOVE(struct name *head, struct type *elm) \ +{ \ + struct type *child, *parent, *old = elm; \ + int color; \ + if (RB_LEFT(elm, field) == NULL) \ + child = RB_RIGHT(elm, field); \ + else if (RB_RIGHT(elm, field) == NULL) \ + child = RB_LEFT(elm, field); \ + else { \ + struct type *left; \ + elm = RB_RIGHT(elm, field); \ + while ((left = RB_LEFT(elm, field))) \ + elm = left; \ + child = RB_RIGHT(elm, field); \ + parent = RB_PARENT(elm, field); \ + color = RB_COLOR(elm, field); \ + if (child) \ + RB_PARENT(child, field) = parent; \ + if (parent) { \ + if (RB_LEFT(parent, field) == elm) \ + RB_LEFT(parent, field) = child; \ + else \ + RB_RIGHT(parent, field) = child; \ + RB_AUGMENT(parent); \ + } else \ + RB_ROOT(head) = child; \ + if (RB_PARENT(elm, field) == old) \ + parent = elm; \ + (elm)->field = (old)->field; \ + if (RB_PARENT(old, field)) { \ + if (RB_LEFT(RB_PARENT(old, field), field) == old)\ + RB_LEFT(RB_PARENT(old, field), field) = elm;\ + else \ + RB_RIGHT(RB_PARENT(old, field), field) = elm;\ + RB_AUGMENT(RB_PARENT(old, field)); \ + } else \ + RB_ROOT(head) = elm; \ + RB_PARENT(RB_LEFT(old, field), field) = elm; \ + if (RB_RIGHT(old, field)) \ + RB_PARENT(RB_RIGHT(old, field), field) = elm; \ + if (parent) { \ + left = parent; \ + do { \ + RB_AUGMENT(left); \ + } while ((left = RB_PARENT(left, field))); \ + } \ + goto color; \ + } \ + parent = RB_PARENT(elm, field); \ + color = RB_COLOR(elm, field); \ + if (child) \ + RB_PARENT(child, field) = parent; \ + if (parent) { \ + if 
(RB_LEFT(parent, field) == elm) \ + RB_LEFT(parent, field) = child; \ + else \ + RB_RIGHT(parent, field) = child; \ + RB_AUGMENT(parent); \ + } else \ + RB_ROOT(head) = child; \ +color: \ + if (color == RB_BLACK) \ + name##_RB_REMOVE_COLOR(head, parent, child); \ + return (old); \ +} \ + \ +/* Inserts a node into the RB tree */ \ +attr struct type * \ +name##_RB_INSERT(struct name *head, struct type *elm) \ +{ \ + struct type *tmp; \ + struct type *parent = NULL; \ + int comp = 0; \ + tmp = RB_ROOT(head); \ + while (tmp) { \ + parent = tmp; \ + comp = (cmp)(elm, parent); \ + if (comp < 0) \ + tmp = RB_LEFT(tmp, field); \ + else if (comp > 0) \ + tmp = RB_RIGHT(tmp, field); \ + else \ + return (tmp); \ + } \ + RB_SET(elm, parent, field); \ + if (parent != NULL) { \ + if (comp < 0) \ + RB_LEFT(parent, field) = elm; \ + else \ + RB_RIGHT(parent, field) = elm; \ + RB_AUGMENT(parent); \ + } else \ + RB_ROOT(head) = elm; \ + name##_RB_INSERT_COLOR(head, elm); \ + return (NULL); \ +} \ + \ +/* Finds the node with the same key as elm */ \ +attr struct type * \ +name##_RB_FIND(struct name *head, struct type *elm) \ +{ \ + struct type *tmp = RB_ROOT(head); \ + int comp; \ + while (tmp) { \ + comp = cmp(elm, tmp); \ + if (comp < 0) \ + tmp = RB_LEFT(tmp, field); \ + else if (comp > 0) \ + tmp = RB_RIGHT(tmp, field); \ + else \ + return (tmp); \ + } \ + return (NULL); \ +} \ + \ +/* Finds the first node greater than or equal to the search key */ \ +attr struct type * \ +name##_RB_NFIND(struct name *head, struct type *elm) \ +{ \ + struct type *tmp = RB_ROOT(head); \ + struct type *res = NULL; \ + int comp; \ + while (tmp) { \ + comp = cmp(elm, tmp); \ + if (comp < 0) { \ + res = tmp; \ + tmp = RB_LEFT(tmp, field); \ + } \ + else if (comp > 0) \ + tmp = RB_RIGHT(tmp, field); \ + else \ + return (tmp); \ + } \ + return (res); \ +} \ + \ +/* ARGSUSED */ \ +attr struct type * \ +name##_RB_NEXT(struct type *elm) \ +{ \ + if (RB_RIGHT(elm, field)) { \ + elm = RB_RIGHT(elm, field); \ + while (RB_LEFT(elm, field)) \ + elm = RB_LEFT(elm, field); \ + } else { \ + if (RB_PARENT(elm, field) && \ + (elm == RB_LEFT(RB_PARENT(elm, field), field))) \ + elm = RB_PARENT(elm, field); \ + else { \ + while (RB_PARENT(elm, field) && \ + (elm == RB_RIGHT(RB_PARENT(elm, field), field)))\ + elm = RB_PARENT(elm, field); \ + elm = RB_PARENT(elm, field); \ + } \ + } \ + return (elm); \ +} \ + \ +/* ARGSUSED */ \ +attr struct type * \ +name##_RB_PREV(struct type *elm) \ +{ \ + if (RB_LEFT(elm, field)) { \ + elm = RB_LEFT(elm, field); \ + while (RB_RIGHT(elm, field)) \ + elm = RB_RIGHT(elm, field); \ + } else { \ + if (RB_PARENT(elm, field) && \ + (elm == RB_RIGHT(RB_PARENT(elm, field), field))) \ + elm = RB_PARENT(elm, field); \ + else { \ + while (RB_PARENT(elm, field) && \ + (elm == RB_LEFT(RB_PARENT(elm, field), field)))\ + elm = RB_PARENT(elm, field); \ + elm = RB_PARENT(elm, field); \ + } \ + } \ + return (elm); \ +} \ + \ +attr struct type * \ +name##_RB_MINMAX(struct name *head, int val) \ +{ \ + struct type *tmp = RB_ROOT(head); \ + struct type *parent = NULL; \ + while (tmp) { \ + parent = tmp; \ + if (val < 0) \ + tmp = RB_LEFT(tmp, field); \ + else \ + tmp = RB_RIGHT(tmp, field); \ + } \ + return (parent); \ +} + +#define RB_NEGINF -1 +#define RB_INF 1 + +#define RB_INSERT(name, x, y) name##_RB_INSERT(x, y) +#define RB_REMOVE(name, x, y) name##_RB_REMOVE(x, y) +#define RB_FIND(name, x, y) name##_RB_FIND(x, y) +#define RB_NFIND(name, x, y) name##_RB_NFIND(x, y) +#define RB_NEXT(name, x, y) name##_RB_NEXT(y) +#define 
RB_PREV(name, x, y) name##_RB_PREV(y) +#define RB_MIN(name, x) name##_RB_MINMAX(x, RB_NEGINF) +#define RB_MAX(name, x) name##_RB_MINMAX(x, RB_INF) + +#define RB_FOREACH(x, name, head) \ + for ((x) = RB_MIN(name, head); \ + (x) != NULL; \ + (x) = name##_RB_NEXT(x)) + +#define RB_FOREACH_SAFE(x, name, head, y) \ + for ((x) = RB_MIN(name, head); \ + ((x) != NULL) && ((y) = name##_RB_NEXT(x), 1); \ + (x) = (y)) + +#define RB_FOREACH_REVERSE(x, name, head) \ + for ((x) = RB_MAX(name, head); \ + (x) != NULL; \ + (x) = name##_RB_PREV(x)) + +#define RB_FOREACH_REVERSE_SAFE(x, name, head, y) \ + for ((x) = RB_MAX(name, head); \ + ((x) != NULL) && ((y) = name##_RB_PREV(x), 1); \ + (x) = (y)) + + +/* + * Copyright (c) 2016 David Gwynne + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +struct rb_type { + int (*t_compare)(const void *, const void *); + void (*t_augment)(void *); + unsigned int t_offset; /* offset of rb_entry in type */ +}; + +struct rb_tree { + struct rb_entry *rbt_root; +}; + +struct rb_entry { + struct rb_entry *rbt_parent; + struct rb_entry *rbt_left; + struct rb_entry *rbt_right; + unsigned int rbt_color; +}; + +#define RBT_HEAD(_name, _type) \ +struct _name { \ + struct rb_tree rbh_root; \ +} + +#define RBT_ENTRY(_type) struct rb_entry + +static inline void +_rb_init(struct rb_tree *rbt) +{ + rbt->rbt_root = NULL; +} + +static inline int +_rb_empty(struct rb_tree *rbt) +{ + return (rbt->rbt_root == NULL); +} + +void *_rb_insert(const struct rb_type *, struct rb_tree *, void *); +void *_rb_remove(const struct rb_type *, struct rb_tree *, void *); +void *_rb_find(const struct rb_type *, struct rb_tree *, const void *); +void *_rb_nfind(const struct rb_type *, struct rb_tree *, const void *); +void *_rb_root(const struct rb_type *, struct rb_tree *); +void *_rb_min(const struct rb_type *, struct rb_tree *); +void *_rb_max(const struct rb_type *, struct rb_tree *); +void *_rb_next(const struct rb_type *, void *); +void *_rb_prev(const struct rb_type *, void *); +void *_rb_left(const struct rb_type *, void *); +void *_rb_right(const struct rb_type *, void *); +void *_rb_parent(const struct rb_type *, void *); +void _rb_set_left(const struct rb_type *, void *, void *); +void _rb_set_right(const struct rb_type *, void *, void *); +void _rb_set_parent(const struct rb_type *, void *, void *); +void _rb_poison(const struct rb_type *, void *, unsigned long); +int _rb_check(const struct rb_type *, void *, unsigned long); + +#define RBT_INITIALIZER(_head) { { NULL } } + +#define RBT_PROTOTYPE(_name, _type, _field, _cmp) \ +extern const struct rb_type *const _name##_RBT_TYPE; \ + \ +__unused static inline void \ +_name##_RBT_INIT(struct _name *head) \ +{ \ + _rb_init(&head->rbh_root); \ +} \ + \ +__unused static inline struct _type * \ +_name##_RBT_INSERT(struct _name *head, struct _type *elm) \ +{ \ + return 
_rb_insert(_name##_RBT_TYPE, &head->rbh_root, elm); \ +} \ + \ +__unused static inline struct _type * \ +_name##_RBT_REMOVE(struct _name *head, struct _type *elm) \ +{ \ + return _rb_remove(_name##_RBT_TYPE, &head->rbh_root, elm); \ +} \ + \ +__unused static inline struct _type * \ +_name##_RBT_FIND(struct _name *head, const struct _type *key) \ +{ \ + return _rb_find(_name##_RBT_TYPE, &head->rbh_root, key); \ +} \ + \ +__unused static inline struct _type * \ +_name##_RBT_NFIND(struct _name *head, const struct _type *key) \ +{ \ + return _rb_nfind(_name##_RBT_TYPE, &head->rbh_root, key); \ +} \ + \ +__unused static inline struct _type * \ +_name##_RBT_ROOT(struct _name *head) \ +{ \ + return _rb_root(_name##_RBT_TYPE, &head->rbh_root); \ +} \ + \ +__unused static inline int \ +_name##_RBT_EMPTY(struct _name *head) \ +{ \ + return _rb_empty(&head->rbh_root); \ +} \ + \ +__unused static inline struct _type * \ +_name##_RBT_MIN(struct _name *head) \ +{ \ + return _rb_min(_name##_RBT_TYPE, &head->rbh_root); \ +} \ + \ +__unused static inline struct _type * \ +_name##_RBT_MAX(struct _name *head) \ +{ \ + return _rb_max(_name##_RBT_TYPE, &head->rbh_root); \ +} \ + \ +__unused static inline struct _type * \ +_name##_RBT_NEXT(struct _type *elm) \ +{ \ + return _rb_next(_name##_RBT_TYPE, elm); \ +} \ + \ +__unused static inline struct _type * \ +_name##_RBT_PREV(struct _type *elm) \ +{ \ + return _rb_prev(_name##_RBT_TYPE, elm); \ +} \ + \ +__unused static inline struct _type * \ +_name##_RBT_LEFT(struct _type *elm) \ +{ \ + return _rb_left(_name##_RBT_TYPE, elm); \ +} \ + \ +__unused static inline struct _type * \ +_name##_RBT_RIGHT(struct _type *elm) \ +{ \ + return _rb_right(_name##_RBT_TYPE, elm); \ +} \ + \ +__unused static inline struct _type * \ +_name##_RBT_PARENT(struct _type *elm) \ +{ \ + return _rb_parent(_name##_RBT_TYPE, elm); \ +} \ + \ +__unused static inline void \ +_name##_RBT_SET_LEFT(struct _type *elm, struct _type *left) \ +{ \ + return _rb_set_left(_name##_RBT_TYPE, elm, left); \ +} \ + \ +__unused static inline void \ +_name##_RBT_SET_RIGHT(struct _type *elm, struct _type *right) \ +{ \ + return _rb_set_right(_name##_RBT_TYPE, elm, right); \ +} \ + \ +__unused static inline void \ +_name##_RBT_SET_PARENT(struct _type *elm, struct _type *parent) \ +{ \ + return _rb_set_parent(_name##_RBT_TYPE, elm, parent); \ +} \ + \ +__unused static inline void \ +_name##_RBT_POISON(struct _type *elm, unsigned long poison) \ +{ \ + return _rb_poison(_name##_RBT_TYPE, elm, poison); \ +} \ + \ +__unused static inline int \ +_name##_RBT_CHECK(struct _type *elm, unsigned long poison) \ +{ \ + return _rb_check(_name##_RBT_TYPE, elm, poison); \ +} + +#define RBT_GENERATE_INTERNAL(_name, _type, _field, _cmp, _aug) \ +static int \ +_name##_RBT_COMPARE(const void *lptr, const void *rptr) \ +{ \ + const struct _type *l = lptr, *r = rptr; \ + return _cmp(l, r); \ +} \ +static const struct rb_type _name##_RBT_INFO = { \ + _name##_RBT_COMPARE, \ + _aug, \ + offsetof(struct _type, _field), \ +}; \ +const struct rb_type *const _name##_RBT_TYPE = &_name##_RBT_INFO + +#define RBT_GENERATE_AUGMENT(_name, _type, _field, _cmp, _aug) \ +static void \ +_name##_RBT_AUGMENT(void *ptr) \ +{ \ + struct _type *p = ptr; \ + return _aug(p); \ +} \ +RBT_GENERATE_INTERNAL(_name, _type, _field, _cmp, _name##_RBT_AUGMENT) + +#define RBT_GENERATE(_name, _type, _field, _cmp) \ + RBT_GENERATE_INTERNAL(_name, _type, _field, _cmp, NULL) + +#define RBT_INIT(_name, _head) _name##_RBT_INIT(_head) +#define RBT_INSERT(_name, _head, 
_elm) _name##_RBT_INSERT(_head, _elm) +#define RBT_REMOVE(_name, _head, _elm) _name##_RBT_REMOVE(_head, _elm) +#define RBT_FIND(_name, _head, _key) _name##_RBT_FIND(_head, _key) +#define RBT_NFIND(_name, _head, _key) _name##_RBT_NFIND(_head, _key) +#define RBT_ROOT(_name, _head) _name##_RBT_ROOT(_head) +#define RBT_EMPTY(_name, _head) _name##_RBT_EMPTY(_head) +#define RBT_MIN(_name, _head) _name##_RBT_MIN(_head) +#define RBT_MAX(_name, _head) _name##_RBT_MAX(_head) +#define RBT_NEXT(_name, _elm) _name##_RBT_NEXT(_elm) +#define RBT_PREV(_name, _elm) _name##_RBT_PREV(_elm) +#define RBT_LEFT(_name, _elm) _name##_RBT_LEFT(_elm) +#define RBT_RIGHT(_name, _elm) _name##_RBT_RIGHT(_elm) +#define RBT_PARENT(_name, _elm) _name##_RBT_PARENT(_elm) +#define RBT_SET_LEFT(_name, _elm, _l) _name##_RBT_SET_LEFT(_elm, _l) +#define RBT_SET_RIGHT(_name, _elm, _r) _name##_RBT_SET_RIGHT(_elm, _r) +#define RBT_SET_PARENT(_name, _elm, _p) _name##_RBT_SET_PARENT(_elm, _p) +#define RBT_POISON(_name, _elm, _p) _name##_RBT_POISON(_elm, _p) +#define RBT_CHECK(_name, _elm, _p) _name##_RBT_CHECK(_elm, _p) + +#define RBT_FOREACH(_e, _name, _head) \ + for ((_e) = RBT_MIN(_name, (_head)); \ + (_e) != NULL; \ + (_e) = RBT_NEXT(_name, (_e))) + +#define RBT_FOREACH_SAFE(_e, _name, _head, _n) \ + for ((_e) = RBT_MIN(_name, (_head)); \ + (_e) != NULL && ((_n) = RBT_NEXT(_name, (_e)), 1); \ + (_e) = (_n)) + +#define RBT_FOREACH_REVERSE(_e, _name, _head) \ + for ((_e) = RBT_MAX(_name, (_head)); \ + (_e) != NULL; \ + (_e) = RBT_PREV(_name, (_e))) + +#define RBT_FOREACH_REVERSE_SAFE(_e, _name, _head, _n) \ + for ((_e) = RBT_MAX(_name, (_head)); \ + (_e) != NULL && ((_n) = RBT_PREV(_name, (_e)), 1); \ + (_e) = (_n)) + +#endif /* _SYS_TREE_H_ */ diff --git a/common/src/ucr_read_builder.h b/common/src/ucr_read_builder.h deleted file mode 100644 index 5b347277a..000000000 --- a/common/src/ucr_read_builder.h +++ /dev/null @@ -1,74 +0,0 @@ -#ifndef UCR_READ_BUILDER_H -#define UCR_READ_BUILDER_H - -/* Generated by flatcc 0.5.3-pre FlatBuffers schema compiler for C by dvide.com */ - -#ifndef UCR_READ_READER_H -#include "ucr_read_reader.h" -#endif -#ifndef FLATBUFFERS_COMMON_BUILDER_H -#include "flatbuffers_common_builder.h" -#endif -#include "flatcc/flatcc_prologue.h" -#ifndef flatbuffers_identifier -#define flatbuffers_identifier 0 -#endif -#ifndef flatbuffers_extension -#define flatbuffers_extension ".bin" -#endif - -#define __unifyfs_Extent_formal_args , uint32_t v0, uint64_t v1, uint64_t v2 -#define __unifyfs_Extent_call_args , v0, v1, v2 -static inline unifyfs_Extent_t *unifyfs_Extent_assign(unifyfs_Extent_t *p, uint32_t v0, uint64_t v1, uint64_t v2) -{ p->fid = v0; p->offset = v1; p->length = v2; - return p; } -static inline unifyfs_Extent_t *unifyfs_Extent_copy(unifyfs_Extent_t *p, const unifyfs_Extent_t *p2) -{ p->fid = p2->fid; p->offset = p2->offset; p->length = p2->length; - return p; } -static inline unifyfs_Extent_t *unifyfs_Extent_assign_to_pe(unifyfs_Extent_t *p, uint32_t v0, uint64_t v1, uint64_t v2) -{ flatbuffers_uint32_assign_to_pe(&p->fid, v0); flatbuffers_uint64_assign_to_pe(&p->offset, v1); flatbuffers_uint64_assign_to_pe(&p->length, v2); - return p; } -static inline unifyfs_Extent_t *unifyfs_Extent_copy_to_pe(unifyfs_Extent_t *p, const unifyfs_Extent_t *p2) -{ flatbuffers_uint32_copy_to_pe(&p->fid, &p2->fid); flatbuffers_uint64_copy_to_pe(&p->offset, &p2->offset); flatbuffers_uint64_copy_to_pe(&p->length, &p2->length); - return p; } -static inline unifyfs_Extent_t *unifyfs_Extent_assign_from_pe(unifyfs_Extent_t *p, 
uint32_t v0, uint64_t v1, uint64_t v2) -{ flatbuffers_uint32_assign_from_pe(&p->fid, v0); flatbuffers_uint64_assign_from_pe(&p->offset, v1); flatbuffers_uint64_assign_from_pe(&p->length, v2); - return p; } -static inline unifyfs_Extent_t *unifyfs_Extent_copy_from_pe(unifyfs_Extent_t *p, const unifyfs_Extent_t *p2) -{ flatbuffers_uint32_copy_from_pe(&p->fid, &p2->fid); flatbuffers_uint64_copy_from_pe(&p->offset, &p2->offset); flatbuffers_uint64_copy_from_pe(&p->length, &p2->length); - return p; } -__flatbuffers_build_struct(flatbuffers_, unifyfs_Extent, 24, 8, unifyfs_Extent_identifier, unifyfs_Extent_type_identifier) - -static const flatbuffers_voffset_t __unifyfs_ReadRequest_required[] = { 0 }; -typedef flatbuffers_ref_t unifyfs_ReadRequest_ref_t; -static unifyfs_ReadRequest_ref_t unifyfs_ReadRequest_clone(flatbuffers_builder_t *B, unifyfs_ReadRequest_table_t t); -__flatbuffers_build_table(flatbuffers_, unifyfs_ReadRequest, 1) - -#define __unifyfs_ReadRequest_formal_args , unifyfs_Extent_vec_ref_t v0 -#define __unifyfs_ReadRequest_call_args , v0 -static inline unifyfs_ReadRequest_ref_t unifyfs_ReadRequest_create(flatbuffers_builder_t *B __unifyfs_ReadRequest_formal_args); -__flatbuffers_build_table_prolog(flatbuffers_, unifyfs_ReadRequest, unifyfs_ReadRequest_identifier, unifyfs_ReadRequest_type_identifier) - -__flatbuffers_build_vector_field(0, flatbuffers_, unifyfs_ReadRequest_extents, unifyfs_Extent, unifyfs_Extent_t, unifyfs_ReadRequest) - -static inline unifyfs_ReadRequest_ref_t unifyfs_ReadRequest_create(flatbuffers_builder_t *B __unifyfs_ReadRequest_formal_args) -{ - if (unifyfs_ReadRequest_start(B) - || unifyfs_ReadRequest_extents_add(B, v0)) { - return 0; - } - return unifyfs_ReadRequest_end(B); -} - -static unifyfs_ReadRequest_ref_t unifyfs_ReadRequest_clone(flatbuffers_builder_t *B, unifyfs_ReadRequest_table_t t) -{ - __flatbuffers_memoize_begin(B, t); - if (unifyfs_ReadRequest_start(B) - || unifyfs_ReadRequest_extents_pick(B, t)) { - return 0; - } - __flatbuffers_memoize_end(B, t, unifyfs_ReadRequest_end(B)); -} - -#include "flatcc/flatcc_epilogue.h" -#endif /* UCR_READ_BUILDER_H */ diff --git a/common/src/ucr_read_reader.h b/common/src/ucr_read_reader.h deleted file mode 100644 index eb26b9d5f..000000000 --- a/common/src/ucr_read_reader.h +++ /dev/null @@ -1,74 +0,0 @@ -#ifndef UCR_READ_READER_H -#define UCR_READ_READER_H - -/* Generated by flatcc 0.5.3-pre FlatBuffers schema compiler for C by dvide.com */ - -#ifndef FLATBUFFERS_COMMON_READER_H -#include "flatbuffers_common_reader.h" -#endif -#include "flatcc/flatcc_flatbuffers.h" -#ifndef __alignas_is_defined -#include -#endif -#include "flatcc/flatcc_prologue.h" -#ifndef flatbuffers_identifier -#define flatbuffers_identifier 0 -#endif -#ifndef flatbuffers_extension -#define flatbuffers_extension ".bin" -#endif - -typedef struct unifyfs_Extent unifyfs_Extent_t; -typedef const unifyfs_Extent_t *unifyfs_Extent_struct_t; -typedef unifyfs_Extent_t *unifyfs_Extent_mutable_struct_t; -typedef const unifyfs_Extent_t *unifyfs_Extent_vec_t; -typedef unifyfs_Extent_t *unifyfs_Extent_mutable_vec_t; - -typedef const struct unifyfs_ReadRequest_table *unifyfs_ReadRequest_table_t; -typedef const flatbuffers_uoffset_t *unifyfs_ReadRequest_vec_t; -typedef flatbuffers_uoffset_t *unifyfs_ReadRequest_mutable_vec_t; -#ifndef unifyfs_Extent_identifier -#define unifyfs_Extent_identifier flatbuffers_identifier -#endif -#define unifyfs_Extent_type_hash ((flatbuffers_thash_t)0xfe153735) -#define unifyfs_Extent_type_identifier "\x35\x37\x15\xfe" 
-#ifndef unifyfs_ReadRequest_identifier -#define unifyfs_ReadRequest_identifier flatbuffers_identifier -#endif -#define unifyfs_ReadRequest_type_hash ((flatbuffers_thash_t)0x70b2f5ee) -#define unifyfs_ReadRequest_type_identifier "\xee\xf5\xb2\x70" - - -struct unifyfs_Extent { - alignas(8) uint32_t fid; - alignas(8) uint64_t offset; - alignas(8) uint64_t length; -}; -static_assert(sizeof(unifyfs_Extent_t) == 24, "struct size mismatch"); - -static inline const unifyfs_Extent_t *unifyfs_Extent__const_ptr_add(const unifyfs_Extent_t *p, size_t i) { return p + i; } -static inline unifyfs_Extent_t *unifyfs_Extent__ptr_add(unifyfs_Extent_t *p, size_t i) { return p + i; } -static inline unifyfs_Extent_struct_t unifyfs_Extent_vec_at(unifyfs_Extent_vec_t vec, size_t i) -__flatbuffers_struct_vec_at(vec, i) -static inline size_t unifyfs_Extent__size() { return 24; } -static inline size_t unifyfs_Extent_vec_len(unifyfs_Extent_vec_t vec) -__flatbuffers_vec_len(vec) -__flatbuffers_struct_as_root(unifyfs_Extent) - -__flatbuffers_define_struct_scalar_field(unifyfs_Extent, fid, flatbuffers_uint32, uint32_t) -__flatbuffers_define_struct_scalar_field(unifyfs_Extent, offset, flatbuffers_uint64, uint64_t) -__flatbuffers_define_struct_scalar_field(unifyfs_Extent, length, flatbuffers_uint64, uint64_t) - - -struct unifyfs_ReadRequest_table { uint8_t unused__; }; - -static inline size_t unifyfs_ReadRequest_vec_len(unifyfs_ReadRequest_vec_t vec) -__flatbuffers_vec_len(vec) -static inline unifyfs_ReadRequest_table_t unifyfs_ReadRequest_vec_at(unifyfs_ReadRequest_vec_t vec, size_t i) -__flatbuffers_offset_vec_at(unifyfs_ReadRequest_table_t, vec, i, 0) -__flatbuffers_table_as_root(unifyfs_ReadRequest) - -__flatbuffers_define_vector_field(0, unifyfs_ReadRequest, extents, unifyfs_Extent_vec_t, 0) - -#include "flatcc/flatcc_epilogue.h" -#endif /* UCR_READ_READER_H */ diff --git a/client/src/unifyfs-stack.c b/common/src/unifyfs-stack.c similarity index 97% rename from client/src/unifyfs-stack.c rename to common/src/unifyfs-stack.c index 8d40b36c9..9e8b647fd 100644 --- a/client/src/unifyfs-stack.c +++ b/common/src/unifyfs-stack.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. diff --git a/client/src/unifyfs-stack.h b/common/src/unifyfs-stack.h similarity index 95% rename from client/src/unifyfs-stack.h rename to common/src/unifyfs-stack.h index 3dd475e19..61ad93408 100644 --- a/client/src/unifyfs-stack.h +++ b/common/src/unifyfs-stack.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. diff --git a/common/src/unifyfs_client_rpcs.h b/common/src/unifyfs_client_rpcs.h index 546810937..64649bae0 100644 --- a/common/src/unifyfs_client_rpcs.h +++ b/common/src/unifyfs_client_rpcs.h @@ -1,5 +1,19 @@ -#ifndef __UNIFYFS_CLIENT_RPCS_H -#define __UNIFYFS_CLIENT_RPCS_H +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. 
+ * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#ifndef UNIFYFS_CLIENT_RPCS_H +#define UNIFYFS_CLIENT_RPCS_H /* * Declarations for client-server margo RPCs (shared-memory) @@ -11,32 +25,55 @@ #include #include +#include "unifyfs_rpc_types.h" + #ifdef __cplusplus extern "C" { #endif -/* unifyfs_mount_rpc (client => server) +typedef enum { + UNIFYFS_CLIENT_RPC_INVALID = 0, + UNIFYFS_CLIENT_RPC_ATTACH, + UNIFYFS_CLIENT_RPC_FILESIZE, + UNIFYFS_CLIENT_RPC_LAMINATE, + UNIFYFS_CLIENT_RPC_METAGET, + UNIFYFS_CLIENT_RPC_METASET, + UNIFYFS_CLIENT_RPC_MOUNT, + UNIFYFS_CLIENT_RPC_MULTIREAD, + UNIFYFS_CLIENT_RPC_READ, + UNIFYFS_CLIENT_RPC_SYNC, + UNIFYFS_CLIENT_RPC_TRUNCATE, + UNIFYFS_CLIENT_RPC_UNLINK, + UNIFYFS_CLIENT_RPC_UNMOUNT +} client_rpc_e; + +/* unifyfs_attach_rpc (client => server) * - * connect application client to the server, and - * initialize shared memory state */ -MERCURY_GEN_PROC(unifyfs_mount_in_t, + * initialize server access to client's shared memory and file state */ +MERCURY_GEN_PROC(unifyfs_attach_in_t, ((int32_t)(app_id)) - ((int32_t)(local_rank_idx)) - ((int32_t)(dbg_rank)) - ((int32_t)(num_procs_per_node)) - ((hg_const_string_t)(client_addr_str)) - ((hg_size_t)(req_buf_sz)) - ((hg_size_t)(recv_buf_sz)) - ((hg_size_t)(superblock_sz)) + ((int32_t)(client_id)) + ((hg_size_t)(shmem_data_size)) + ((hg_size_t)(shmem_super_size)) ((hg_size_t)(meta_offset)) ((hg_size_t)(meta_size)) - ((hg_size_t)(fmeta_offset)) - ((hg_size_t)(fmeta_size)) - ((hg_size_t)(data_offset)) - ((hg_size_t)(data_size)) - ((hg_const_string_t)(external_spill_dir))) + ((hg_size_t)(logio_mem_size)) + ((hg_size_t)(logio_spill_size)) + ((hg_const_string_t)(logio_spill_dir))) +MERCURY_GEN_PROC(unifyfs_attach_out_t, + ((int32_t)(ret))) +DECLARE_MARGO_RPC_HANDLER(unifyfs_attach_rpc) + +/* unifyfs_mount_rpc (client => server) + * + * connect application client to the server */ +MERCURY_GEN_PROC(unifyfs_mount_in_t, + ((int32_t)(dbg_rank)) + ((hg_const_string_t)(mount_prefix)) + ((hg_const_string_t)(client_addr_str))) MERCURY_GEN_PROC(unifyfs_mount_out_t, - ((hg_size_t)(max_recs_per_slice)) + ((int32_t)(app_id)) + ((int32_t)(client_id)) ((int32_t)(ret))) DECLARE_MARGO_RPC_HANDLER(unifyfs_mount_rpc) @@ -44,33 +81,20 @@ DECLARE_MARGO_RPC_HANDLER(unifyfs_mount_rpc) * * disconnect client from server */ MERCURY_GEN_PROC(unifyfs_unmount_in_t, - ((int32_t)(app_id)) - ((int32_t)(local_rank_idx))) + ((int32_t)(app_id)) + ((int32_t)(client_id))) MERCURY_GEN_PROC(unifyfs_unmount_out_t, ((int32_t)(ret))) DECLARE_MARGO_RPC_HANDLER(unifyfs_unmount_rpc) -/* need to transfer timespec structs */ -typedef struct timespec sys_timespec_t; -MERCURY_GEN_STRUCT_PROC(sys_timespec_t, - ((uint64_t)(tv_sec)) - ((uint64_t)(tv_nsec))) - /* unifyfs_metaset_rpc (client => server) * * given a global file id and a file name, * record key/value entry for this file */ MERCURY_GEN_PROC(unifyfs_metaset_in_t, - ((hg_const_string_t)(filename)) - ((int32_t)(fid)) - ((int32_t)(gfid)) - ((uint32_t)(mode)) - ((uint32_t)(uid)) - ((uint32_t)(gid)) - ((uint64_t)(size)) - ((sys_timespec_t)(atime)) - ((sys_timespec_t)(mtime)) - ((sys_timespec_t)(ctime)) - ((uint32_t)(is_laminated))) + ((int32_t)(app_id)) + ((int32_t)(client_id)) + ((int32_t)(attr_op)) + ((unifyfs_file_attr_t)(attr))) MERCURY_GEN_PROC(unifyfs_metaset_out_t, ((int32_t)(ret))) DECLARE_MARGO_RPC_HANDLER(unifyfs_metaset_rpc) 
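/*
 * Usage sketch: how a client might invoke the mount RPC declared above with
 * Margo. The registration name string and the way the margo instance and
 * server address are obtained are assumptions for illustration; the actual
 * client wires these up in its RPC setup code.
 */
#include <margo.h>
#include "unifyfs_client_rpcs.h"

int example_invoke_mount(margo_instance_id mid, hg_addr_t server_addr,
                         int dbg_rank, const char* mount_prefix,
                         const char* client_addr_str)
{
    /* assumed registration name; the client passes NULL for the handler */
    hg_id_t rpc_id = MARGO_REGISTER(mid, "unifyfs_mount_rpc",
                                    unifyfs_mount_in_t,
                                    unifyfs_mount_out_t, NULL);

    hg_handle_t handle;
    if (margo_create(mid, server_addr, rpc_id, &handle) != HG_SUCCESS) {
        return -1;
    }

    unifyfs_mount_in_t in;
    in.dbg_rank = (int32_t) dbg_rank;
    in.mount_prefix = mount_prefix;
    in.client_addr_str = client_addr_str;

    int ret = -1;
    if (margo_forward(handle, &in) == HG_SUCCESS) {
        unifyfs_mount_out_t out;
        if (margo_get_output(handle, &out) == HG_SUCCESS) {
            /* server replies with the client's app_id/client_id and status */
            ret = (int) out.ret;
            margo_free_output(handle, &out);
        }
    }

    margo_destroy(handle);
    return ret;
}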
@@ -79,30 +103,22 @@ DECLARE_MARGO_RPC_HANDLER(unifyfs_metaset_rpc) * returns file metadata including size and name * given a global file id */ MERCURY_GEN_PROC(unifyfs_metaget_in_t, + ((int32_t)(app_id)) + ((int32_t)(client_id)) ((int32_t)(gfid))) MERCURY_GEN_PROC(unifyfs_metaget_out_t, ((int32_t)(ret)) - ((hg_const_string_t)(filename)) - ((int32_t)(fid)) - ((int32_t)(gfid)) - ((uint32_t)(mode)) - ((uint32_t)(uid)) - ((uint32_t)(gid)) - ((uint64_t)(size)) - ((sys_timespec_t)(atime)) - ((sys_timespec_t)(mtime)) - ((sys_timespec_t)(ctime)) - ((uint32_t)(is_laminated))) + ((unifyfs_file_attr_t)(attr))) DECLARE_MARGO_RPC_HANDLER(unifyfs_metaget_rpc) /* unifyfs_fsync_rpc (client => server) * - * given app_id, client_id, and a global file id as input, - * read extent location metadata from client shared memory - * and insert corresponding key/value pairs into global index */ + * given a client identified by (app_id, client_id) as input, read the write + * extents for one or more of the client's files from the shared memory index + * and update the global metadata for the file(s) */ MERCURY_GEN_PROC(unifyfs_fsync_in_t, ((int32_t)(app_id)) - ((int32_t)(local_rank_idx)) + ((int32_t)(client_id)) ((int32_t)(gfid))) MERCURY_GEN_PROC(unifyfs_fsync_out_t, ((int32_t)(ret))) DECLARE_MARGO_RPC_HANDLER(unifyfs_fsync_rpc) @@ -113,20 +129,57 @@ DECLARE_MARGO_RPC_HANDLER(unifyfs_fsync_rpc) * return filesize for given file */ MERCURY_GEN_PROC(unifyfs_filesize_in_t, ((int32_t)(app_id)) - ((int32_t)(local_rank_idx)) + ((int32_t)(client_id)) ((int32_t)(gfid))) MERCURY_GEN_PROC(unifyfs_filesize_out_t, ((int32_t)(ret)) ((hg_size_t)(filesize))) DECLARE_MARGO_RPC_HANDLER(unifyfs_filesize_rpc) +/* unifyfs_truncate_rpc (client => server) + * + * given an app_id, client_id, global file id, + * and a filesize, truncate file to that size */ +MERCURY_GEN_PROC(unifyfs_truncate_in_t, + ((int32_t)(app_id)) + ((int32_t)(client_id)) + ((int32_t)(gfid)) + ((hg_size_t)(filesize))) +MERCURY_GEN_PROC(unifyfs_truncate_out_t, + ((int32_t)(ret))) +DECLARE_MARGO_RPC_HANDLER(unifyfs_truncate_rpc) + +/* unifyfs_unlink_rpc (client => server) + * + * given an app_id, client_id, and global file id, + * unlink the file */ +MERCURY_GEN_PROC(unifyfs_unlink_in_t, + ((int32_t)(app_id)) + ((int32_t)(client_id)) + ((int32_t)(gfid))) +MERCURY_GEN_PROC(unifyfs_unlink_out_t, + ((int32_t)(ret))) +DECLARE_MARGO_RPC_HANDLER(unifyfs_unlink_rpc) + +/* unifyfs_laminate_rpc (client => server) + * + * given an app_id, client_id, and global file id, + * laminate the file */ +MERCURY_GEN_PROC(unifyfs_laminate_in_t, + ((int32_t)(app_id)) + ((int32_t)(client_id)) + ((int32_t)(gfid))) +MERCURY_GEN_PROC(unifyfs_laminate_out_t, + ((int32_t)(ret))) +DECLARE_MARGO_RPC_HANDLER(unifyfs_laminate_rpc) + /* unifyfs_read_rpc (client => server) * * given an app_id, client_id, global file id, an offset, and a length, * initiate read request for data */ MERCURY_GEN_PROC(unifyfs_read_in_t, ((int32_t)(app_id)) - ((int32_t)(local_rank_idx)) + ((int32_t)(client_id)) ((int32_t)(gfid)) ((hg_size_t)(offset)) ((hg_size_t)(length))) @@ -135,12 +188,12 @@ DECLARE_MARGO_RPC_HANDLER(unifyfs_read_rpc) /* unifyfs_mread_rpc (client => server) * - * given an app_id, client_id, global file id, and a count - * of read requests, followed by list of offset/length tuples + * given an app_id, client_id, and count of read requests, + * followed by list of (gfid, offset, length) tuples, * initiate read requests for data */ MERCURY_GEN_PROC(unifyfs_mread_in_t, ((int32_t)(app_id)) - 
((int32_t)(local_rank_idx)) + ((int32_t)(client_id)) ((int32_t)(read_count)) ((hg_size_t)(bulk_size)) ((hg_bulk_t)(bulk_handle))) diff --git a/common/src/unifyfs_configurator.c b/common/src/unifyfs_configurator.c index 56e1bba34..ac5daaed1 100644 --- a/common/src/unifyfs_configurator.c +++ b/common/src/unifyfs_configurator.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -58,7 +58,7 @@ int unifyfs_config_init(unifyfs_cfg_t *cfg, char *syscfg = NULL; if (cfg == NULL) - return -1; + return EINVAL; memset((void *)cfg, 0, sizeof(unifyfs_cfg_t)); @@ -101,14 +101,14 @@ int unifyfs_config_init(unifyfs_cfg_t *cfg, if (rc) return rc; - return 0; + return (int)UNIFYFS_SUCCESS; } // cleanup allocated state int unifyfs_config_fini(unifyfs_cfg_t *cfg) { if (cfg == NULL) - return -1; + return EINVAL; #define UNIFYFS_CFG(sec, key, typ, dv, desc, vfn) \ if (cfg->sec##_##key != NULL) { \ @@ -146,7 +146,7 @@ int unifyfs_config_fini(unifyfs_cfg_t *cfg) #undef UNIFYFS_CFG_MULTI #undef UNIFYFS_CFG_MULTI_CLI - return 0; + return (int)UNIFYFS_SUCCESS; } // print configuration to specified file (or stderr) @@ -266,7 +266,7 @@ int unifyfs_config_set_defaults(unifyfs_cfg_t *cfg) char *val; if (cfg == NULL) - return -1; + return EINVAL; #define UNIFYFS_CFG(sec, key, typ, dv, desc, vfn) \ val = stringify(dv); \ @@ -292,7 +292,7 @@ int unifyfs_config_set_defaults(unifyfs_cfg_t *cfg) #undef UNIFYFS_CFG_MULTI #undef UNIFYFS_CFG_MULTI_CLI - return 0; + return (int)UNIFYFS_SUCCESS; } @@ -363,7 +363,7 @@ int unifyfs_config_process_cli_args(unifyfs_cfg_t *cfg, extern int optind, optopt; if (cfg == NULL) - return -1; + return EINVAL; // setup short_opts and cli_options memset((void *)short_opts, 0, sizeof(short_opts)); @@ -461,9 +461,9 @@ int unifyfs_config_process_cli_args(unifyfs_cfg_t *cfg, } if (!usage_err) - rc = 0; + rc = (int)UNIFYFS_SUCCESS; else { - rc = -1; + rc = (int)UNIFYFS_FAILURE; unifyfs_config_cli_usage_error(argv[0], errmsg); } @@ -512,7 +512,7 @@ int unifyfs_config_process_environ(unifyfs_cfg_t *cfg) char *envval; if (cfg == NULL) - return -1; + return EINVAL; #define UNIFYFS_CFG(sec, key, typ, dv, desc, vfn) \ @@ -559,7 +559,7 @@ int unifyfs_config_process_environ(unifyfs_cfg_t *cfg) #undef UNIFYFS_CFG_MULTI #undef UNIFYFS_CFG_MULTI_CLI - return 0; + return (int)UNIFYFS_SUCCESS; } // inih callback handler @@ -611,7 +611,7 @@ int inih_config_handler(void *user, cfg->sec##_##key[cfg->n_##sec##_##key++] = strdup(val); \ } - UNIFYFS_CONFIGS; +UNIFYFS_CONFIGS #undef UNIFYFS_CFG #undef UNIFYFS_CFG_CLI #undef UNIFYFS_CFG_MULTI @@ -636,7 +636,7 @@ int unifyfs_config_process_ini_file(unifyfs_cfg_t *cfg, inih_rc = ini_parse(file, inih_config_handler, cfg); switch (inih_rc) { case 0: - rc = 0; + rc = (int)UNIFYFS_SUCCESS; break; case -1: snprintf(errmsg, sizeof(errmsg), @@ -662,7 +662,7 @@ int unifyfs_config_process_ini_file(unifyfs_cfg_t *cfg, snprintf(errmsg, sizeof(errmsg), "failed to parse config file %s", file); - rc = EINVAL; + rc = (int)UNIFYFS_ERROR_BADCONFIG; fprintf(stderr, "UNIFYFS CONFIG ERROR: %s\n", errmsg); break; } @@ -697,7 +697,7 @@ int validate_value(const char *section, // validate configuration int unifyfs_config_validate(unifyfs_cfg_t *cfg) { - int rc = 0; + int rc = (int)UNIFYFS_SUCCESS; int vrc; char *new_val 
= NULL; diff --git a/common/src/unifyfs_configurator.h b/common/src/unifyfs_configurator.h index e2cb36f08..54d0f5584 100644 --- a/common/src/unifyfs_configurator.h +++ b/common/src/unifyfs_configurator.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -70,28 +70,28 @@ UNIFYFS_CFG_CLI(unifyfs, daemonize, BOOL, on, "enable server daemonization", NULL, 'D', "on|off") \ UNIFYFS_CFG_CLI(unifyfs, mountpoint, STRING, /unifyfs, "mountpoint directory", NULL, 'm', "specify full path to desired mountpoint") \ UNIFYFS_CFG(client, max_files, INT, UNIFYFS_MAX_FILES, "client max file count", NULL) \ + UNIFYFS_CFG(client, local_extents, BOOL, off, "track extents to service reads of local data", NULL) \ + UNIFYFS_CFG(client, recv_data_size, INT, UNIFYFS_DATA_RECV_SIZE, "shared memory segment size in bytes for receiving data from server", NULL) \ + UNIFYFS_CFG(client, write_index_size, INT, UNIFYFS_INDEX_BUF_SIZE, "write metadata index buffer size", NULL) \ + UNIFYFS_CFG(client, cwd, STRING, NULLSTRING, "current working directory", NULL) \ + UNIFYFS_CFG(client, write_sync, BOOL, off, "sync every write to server", NULL) \ UNIFYFS_CFG_CLI(log, verbosity, INT, 0, "log verbosity level", NULL, 'v', "specify logging verbosity level") \ UNIFYFS_CFG_CLI(log, file, STRING, unifyfsd.log, "log file name", NULL, 'l', "specify log file name") \ UNIFYFS_CFG_CLI(log, dir, STRING, LOGDIR, "log file directory", configurator_directory_check, 'L', "specify full path to directory to contain log file") \ - UNIFYFS_CFG(logfs, index_buf_size, INT, UNIFYFS_INDEX_BUF_SIZE, "log file system index buffer size", NULL) \ - UNIFYFS_CFG(logfs, attr_buf_size, INT, UNIFYFS_FATTR_BUF_SIZE, "log file system file attributes buffer size", NULL) \ + UNIFYFS_CFG(logio, chunk_size, INT, UNIFYFS_LOGIO_CHUNK_SIZE, "log-based I/O data chunk size", NULL) \ + UNIFYFS_CFG(logio, shmem_size, INT, UNIFYFS_LOGIO_SHMEM_SIZE, "log-based I/O shared memory region size", NULL) \ + UNIFYFS_CFG(logio, spill_size, INT, UNIFYFS_LOGIO_SPILL_SIZE, "log-based I/O spillover file size", NULL) \ + UNIFYFS_CFG(logio, spill_dir, STRING, NULLSTRING, "spillover directory", configurator_directory_check) \ UNIFYFS_CFG(margo, tcp, BOOL, on, "use TCP for server-server margo RPCs", NULL) \ UNIFYFS_CFG(meta, db_name, STRING, META_DEFAULT_DB_NAME, "metadata database name", NULL) \ - UNIFYFS_CFG(meta, db_path, STRING, /tmp, "metadata database path", NULL) \ + UNIFYFS_CFG(meta, db_path, STRING, RUNDIR, "metadata database path", configurator_directory_check) \ UNIFYFS_CFG(meta, server_ratio, INT, META_DEFAULT_SERVER_RATIO, "metadata server ratio", NULL) \ UNIFYFS_CFG(meta, range_size, INT, META_DEFAULT_RANGE_SZ, "metadata range size", NULL) \ - UNIFYFS_CFG_CLI(runstate, dir, STRING, RUNDIR, "runstate file directory", configurator_directory_check, 'R', "specify full path to directory to contain server runstate file") \ + UNIFYFS_CFG_CLI(runstate, dir, STRING, RUNDIR, "runstate file directory", configurator_directory_check, 'R', "specify full path to directory to contain server-local state") \ UNIFYFS_CFG_CLI(server, hostfile, STRING, NULLSTRING, "server hostfile name", NULL, 'H', "specify full path to server hostfile") \ + UNIFYFS_CFG_CLI(server, init_timeout, INT, UNIFYFS_DEFAULT_INIT_TIMEOUT, 
"timeout of waiting for server initialization", NULL, 't', "timeout in seconds to wait for servers to be ready for clients") \ + UNIFYFS_CFG(server, max_app_clients, INT, MAX_APP_CLIENTS, "maximum number of clients per application", NULL) \ UNIFYFS_CFG_CLI(sharedfs, dir, STRING, NULLSTRING, "shared file system directory", configurator_directory_check, 'S', "specify full path to directory to contain server shared files") \ - UNIFYFS_CFG(shmem, chunk_bits, INT, UNIFYFS_CHUNK_BITS, "shared memory data chunk size in bits (i.e., size=2^bits)", NULL) \ - UNIFYFS_CFG(shmem, chunk_mem, INT, UNIFYFS_CHUNK_MEM, "shared memory segment size for data chunks", NULL) \ - UNIFYFS_CFG(shmem, recv_size, INT, UNIFYFS_SHMEM_RECV_SIZE, "shared memory segment size in bytes for receiving data from delegators", NULL) \ - UNIFYFS_CFG(shmem, req_size, INT, UNIFYFS_SHMEM_REQ_SIZE, "shared memory segment size in bytes for sending requests to delegators", NULL) \ - UNIFYFS_CFG(shmem, single, BOOL, off, "use single shared memory region for all clients", NULL) \ - UNIFYFS_CFG(spillover, enabled, BOOL, on, "use local device for data chunk spillover", NULL) \ - UNIFYFS_CFG(spillover, data_dir, STRING, NULLSTRING, "spillover data directory", configurator_directory_check) \ - UNIFYFS_CFG(spillover, meta_dir, STRING, NULLSTRING, "spillover metadata directory", configurator_directory_check) \ - UNIFYFS_CFG(spillover, size, INT, UNIFYFS_SPILLOVER_SIZE, "spillover max data size in bytes", NULL) \ #ifdef __cplusplus extern "C" { diff --git a/common/src/unifyfs_const.h b/common/src/unifyfs_const.h index 1474b35c6..39f0eb599 100644 --- a/common/src/unifyfs_const.h +++ b/common/src/unifyfs_const.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. 
@@ -30,9 +30,8 @@ #ifndef UNIFYFS_CONST_H #define UNIFYFS_CONST_H -/* ********************** ERROR CODES ************************ */ -#include "err_enumerator.h" -#define ULFS_SUCCESS ((int)UNIFYFS_SUCCESS) +/* ********************** RETURN CODES ************************ */ +#include "unifyfs_rc.h" /* ********************** STRING CONSTANTS ************************ */ #define DEFAULT_INTERFACE "ib0" @@ -50,47 +49,38 @@ #define UNIFYFS_MAX_FILENAME KIB #define UNIFYFS_MAX_HOSTNAME 64 -// Metadata -#define MAX_FILE_CNT_PER_NODE KIB - -// Request Manager -#define RECV_BUF_CNT 4 /* number of remote read buffers */ -#define SENDRECV_BUF_LEN (8 * MIB) /* remote read buffer size */ +// Server - Request Manager +#define MAX_DATA_TX_SIZE (4 * MIB) /* data transfer size (to client) */ #define MAX_META_PER_SEND (4 * KIB) /* max read request count per server */ -#define REQ_BUF_LEN (MAX_META_PER_SEND * 128) /* read requests (send_msg_t) */ +#define REQ_BUF_LEN (MAX_META_PER_SEND * 64) /* chunk read reqs buffer size */ #define SHM_WAIT_INTERVAL 1000 /* unit: ns */ #define RM_MAX_ACTIVE_REQUESTS 64 /* number of concurrent read requests */ -// Service Manager -#define LARGE_BURSTY_DATA (512 * MIB) -#define MAX_BURSTY_INTERVAL 10000 /* unit: us */ -#define MIN_SLEEP_INTERVAL 10 /* unit: us */ -#define SLEEP_INTERVAL 500 /* unit: us */ -#define SLEEP_SLICE_PER_UNIT 50 /* unit: us */ -#define READ_BLOCK_SIZE MIB -#define READ_BUF_SZ GIB +// Server - Service Manager +#define MIN_SLEEP_INTERVAL 50 /* unit: us */ -// Request and Service Managers, Command Handler -#define MAX_NUM_CLIENTS 64 /* app processes per server */ - -// Client and Command Handler -#define CMD_BUF_SIZE (2 * KIB) +// Server - General +#define MAX_BULK_TX_SIZE (8 * MIB) /* bulk transfer size (between servers) */ +#define MAX_NUM_APPS 64 /* max # apps/mountpoints supported */ +#define MAX_APP_CLIENTS 256 /* max # clients per application */ +#define UNIFYFS_DEFAULT_INIT_TIMEOUT 120 /* server init timeout (seconds) */ +#define UNIFYFSD_PID_FILENAME "unifyfsd.pids" +#define UNIFYFS_STAGE_STATUS_FILENAME "unifyfs-stage.status" // Client #define UNIFYFS_MAX_FILES 128 #define UNIFYFS_MAX_FILEDESCS UNIFYFS_MAX_FILES #define UNIFYFS_STREAM_BUFSIZE MIB -#define UNIFYFS_CHUNK_BITS 24 -#define UNIFYFS_CHUNK_MEM (256 * MIB) -#define UNIFYFS_SPILLOVER_SIZE (KIB * MIB) -#define UNIFYFS_SUPERBLOCK_KEY 4321 -#define UNIFYFS_SHMEM_REQ_SIZE (8 * MIB) -#define UNIFYFS_SHMEM_RECV_SIZE (32 * MIB) +#define UNIFYFS_DATA_RECV_SIZE (32 * MIB) #define UNIFYFS_INDEX_BUF_SIZE (20 * MIB) -#define UNIFYFS_FATTR_BUF_SIZE MIB #define UNIFYFS_MAX_READ_CNT KIB -/* max read size = UNIFYFS_MAX_SPLIT_CNT * META_DEFAULT_RANGE_SZ */ +// Log-based I/O +#define UNIFYFS_LOGIO_CHUNK_SIZE (4 * MIB) +#define UNIFYFS_LOGIO_SHMEM_SIZE (256 * MIB) +#define UNIFYFS_LOGIO_SPILL_SIZE (GIB) + +/* NOTE: max read size = UNIFYFS_MAX_SPLIT_CNT * META_DEFAULT_RANGE_SZ */ #define UNIFYFS_MAX_SPLIT_CNT (4 * KIB) // Metadata/MDHIM Default Values diff --git a/common/src/unifyfs_keyval.c b/common/src/unifyfs_keyval.c index e5f528f88..e73237e9b 100644 --- a/common/src/unifyfs_keyval.c +++ b/common/src/unifyfs_keyval.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2019, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2019, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. 
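Note (illustrative sketch, not part of this patch): the unifyfs_keyval changes that follow add publish/lookup routines plus a new fence. A minimal server-side usage of that API, with a hypothetical helper name and assuming the key-value store has already been initialized, might look like this.

#include "unifyfs_keyval.h"

/* Sketch: publish this server's margo address, wait for all servers to
 * publish, then look up the address published by server rank 0. */
static int example_exchange_addrs(const char* my_margo_addr, char** rank0_addr)
{
    /* publish my address under the well-known key */
    int rc = unifyfs_keyval_publish_remote(key_unifyfsd_margo_svr,
                                           my_margo_addr);
    if (rc != (int)UNIFYFS_SUCCESS) {
        return rc;
    }

    /* block until every server has published its key-value pairs */
    rc = unifyfs_keyval_fence_remote();
    if (rc != (int)UNIFYFS_SUCCESS) {
        return rc;
    }

    /* retrieve the value published by rank 0; caller frees *rank0_addr */
    return unifyfs_keyval_lookup_remote(0, key_unifyfsd_margo_svr, rank0_addr);
}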
@@ -15,6 +15,7 @@ #include "unifyfs_const.h" #include "unifyfs_keyval.h" #include "unifyfs_log.h" +#include "unifyfs_misc.h" //#include "config.h" @@ -33,11 +34,10 @@ #include // UnifyFS keys -const char* key_runstate = "unifyfs.runstate"; -const char* key_unifyfsd_socket = "unifyfsd.socket"; -const char* key_unifyfsd_margo_shm = "unifyfsd.margo-shm"; -const char* key_unifyfsd_margo_svr = "unifyfsd.margo-svr"; -const char* key_unifyfsd_mpi_rank = "unifyfsd.mpi-rank"; +const char* const key_unifyfsd_socket = "unifyfsd.socket"; +const char* const key_unifyfsd_margo_shm = "unifyfsd.margo-shm"; +const char* const key_unifyfsd_margo_svr = "unifyfsd.margo-svr"; +const char* const key_unifyfsd_pmi_rank = "unifyfsd.pmi-rank"; // key-value store state static int kv_initialized; // = 0 @@ -143,7 +143,7 @@ static int unifyfs_pmi2_init(void) if (rc != PMI2_SUCCESS) { unifyfs_pmi2_errstr(rc); LOGERR("PMI2_Init() failed: %s", pmi2_errstr); - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_PMI; } pmi_world_rank = rank; pmi_world_nprocs = nprocs; @@ -155,7 +155,7 @@ static int unifyfs_pmi2_init(void) if (rc != PMI2_SUCCESS) { unifyfs_pmi2_errstr(rc); LOGERR("PMI2_Job_GetRank() failed: %s", pmi2_errstr); - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_PMI; } else { pmi_world_rank = rank; } @@ -185,7 +185,7 @@ static int unifyfs_pmi2_init(void) if (rc != PMI2_SUCCESS) { unifyfs_pmi2_errstr(rc); LOGERR("PMI2_Job_GetId() failed: %s", pmi2_errstr); - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_PMI; } kv_myrank = pmi_world_rank; @@ -209,7 +209,7 @@ static int unifyfs_pmi2_fini(void) if (rc != PMI2_SUCCESS) { unifyfs_pmi2_errstr(rc); LOGERR("PMI2_Finalize() failed: %s", pmi2_errstr); - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_PMI; } pmi2_need_finalize = 0; pmi2_initialized = 0; @@ -227,7 +227,7 @@ static int unifyfs_pmi2_lookup(const char* key, char pmi2_val[PMI2_MAX_VALLEN] = {0}; if (!pmi2_initialized) { - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_PMI; } strncpy(pmi2_key, key, sizeof(pmi2_key)); @@ -237,7 +237,7 @@ static int unifyfs_pmi2_lookup(const char* key, if (rc != PMI2_SUCCESS) { unifyfs_pmi2_errstr(rc); LOGERR("PMI2_KVS_Get(%s) failed: %s", key, pmi2_errstr); - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_PMI; } *oval = strdup(pmi2_val); return (int)UNIFYFS_SUCCESS; @@ -252,7 +252,7 @@ static int unifyfs_pmi2_publish(const char* key, char pmi2_val[PMI2_MAX_VALLEN] = {0}; if (!pmi2_initialized) { - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_PMI; } strncpy(pmi2_key, key, sizeof(pmi2_key)); @@ -261,13 +261,20 @@ static int unifyfs_pmi2_publish(const char* key, if (rc != PMI2_SUCCESS) { unifyfs_pmi2_errstr(rc); LOGERR("PMI2_KVS_Put(%s) failed: %s", key, pmi2_errstr); - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_PMI; } - rc = PMI2_KVS_Fence(); + return (int)UNIFYFS_SUCCESS; +} + +static int unifyfs_pmi2_fence(void) +{ + /* PMI2_KVS_Fence() is a collective barrier that ensures + * all previous KVS_Put()s are visible */ + int rc = PMI2_KVS_Fence(); if (rc != PMI2_SUCCESS) { unifyfs_pmi2_errstr(rc); LOGERR("PMI2_KVS_Fence() failed: %s", pmi2_errstr); - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_PMI; } return (int)UNIFYFS_SUCCESS; } @@ -300,7 +307,7 @@ static int unifyfs_pmix_init(void) rc = PMIx_Init(&pmix_myproc, NULL, 0); if (rc != PMIX_SUCCESS) { LOGERR("PMIx_Init() failed: %s", PMIx_Error_string(rc)); - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_PMI; } kv_max_keylen = 
PMIX_MAX_KEYLEN; @@ -314,7 +321,7 @@ static int unifyfs_pmix_init(void) if (rc != PMIX_SUCCESS) { LOGERR("PMIx rank %d: PMIx_Get(UNIV_SIZE) failed: %s", pmix_myproc.rank, PMIx_Error_string(rc)); - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_PMI; } pmix_univ_nprocs = (size_t) valp->data.uint32; @@ -358,7 +365,7 @@ static int unifyfs_pmix_fini(void) if (rc != PMIX_SUCCESS) { LOGERR("PMIx rank %d: PMIx_Finalize() failed: %s", pmix_myproc.rank, PMIx_Error_string(rc)); - rc = (int) UNIFYFS_FAILURE; + rc = (int) UNIFYFS_ERROR_PMI; } else { PMIX_PROC_DESTRUCT(&pmix_myproc); pmix_initialized = 0; @@ -378,7 +385,7 @@ static int unifyfs_pmix_lookup(const char* key, char pmix_key[PMIX_MAX_KEYLEN+1]; if (!pmix_initialized) { - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_PMI; } /* set key to lookup */ @@ -400,7 +407,7 @@ static int unifyfs_pmix_lookup(const char* key, LOGERR("PMIx rank %d: PMIx_Lookup(%s) failed: %s", pmix_myproc.rank, pmix_key, PMIx_Error_string(rc)); *oval = NULL; - rc = (int)UNIFYFS_FAILURE; + rc = (int)UNIFYFS_ERROR_PMI; } else { if (pdata[0].value.data.string != NULL) { *oval = strdup(pdata[0].value.data.string); @@ -409,7 +416,7 @@ static int unifyfs_pmix_lookup(const char* key, LOGERR("PMIx rank %d: PMIx_Lookup(%s) returned NULL string", pmix_myproc.rank, pmix_key); *oval = NULL; - rc = (int)UNIFYFS_FAILURE; + rc = (int)UNIFYFS_ERROR_PMI; } } /* cleanup */ @@ -429,7 +436,7 @@ static int unifyfs_pmix_publish(const char* key, char pmix_key[PMIX_MAX_KEYLEN+1]; if (!pmix_initialized) { - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_PMI; } /* set key-val and modify publish behavior */ @@ -445,7 +452,7 @@ static int unifyfs_pmix_publish(const char* key, if (rc != PMIX_SUCCESS) { LOGERR("PMIx rank %d: PMIx_Publish failed: %s", pmix_myproc.rank, PMIx_Error_string(rc)); - rc = (int)UNIFYFS_FAILURE; + rc = (int)UNIFYFS_ERROR_PMI; } else { rc = (int)UNIFYFS_SUCCESS; } @@ -454,6 +461,20 @@ static int unifyfs_pmix_publish(const char* key, return rc; } +static int unifyfs_pmix_fence(void) +{ + // PMIx_Fence is a collective barrier across all processes in my namespace + int rc = PMIx_Fence(NULL, 0, NULL, 0); + if (rc != PMIX_SUCCESS) { + LOGERR("PMIx rank %d: PMIx_Fence failed: %s", + pmix_myproc.rank, PMIx_Error_string(rc)); + rc = (int)UNIFYFS_ERROR_PMI; + } else { + rc = (int)UNIFYFS_SUCCESS; + } + return rc; +} + #endif // USE_PMIX @@ -472,7 +493,7 @@ static int unifyfs_fskv_init(unifyfs_cfg_t* cfg) if (NULL == cfg) { LOGERR("NULL config"); - return (int)UNIFYFS_ERROR_INVAL; + return EINVAL; } memset(localfs_kvdir, 0, sizeof(localfs_kvdir)); @@ -482,7 +503,7 @@ static int unifyfs_fskv_init(unifyfs_cfg_t* cfg) // find or create local kvstore directory if (NULL == cfg->runstate_dir) { LOGERR("local file system k-v store requires cfg.runstate_dir"); - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_BADCONFIG; } snprintf(localfs_kvdir, sizeof(localfs_kvdir), "%s/kvstore", cfg->runstate_dir); @@ -496,11 +517,11 @@ static int unifyfs_fskv_init(unifyfs_cfg_t* cfg) if ((rc != 0) && (err != EEXIST)) { LOGERR("failed to create local kvstore directory %s - %s", localfs_kvdir, strerror(err)); - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_KEYVAL; } } else { LOGERR("missing local kvstore directory %s", localfs_kvdir); - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_KEYVAL; } } @@ -517,12 +538,12 @@ static int unifyfs_fskv_init(unifyfs_cfg_t* cfg) if ((rc != 0) && (err != EEXIST)) { LOGERR("failed to create kvstore directory %s - %s", 
sharedfs_kvdir, strerror(err)); - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_KEYVAL; } } // find or create rank-specific subdir - snprintf(sharedfs_rank_kvdir, sizeof(sharedfs_rank_kvdir), "%s/%d", + scnprintf(sharedfs_rank_kvdir, sizeof(sharedfs_rank_kvdir), "%s/%d", sharedfs_kvdir, kv_myrank); memset(&s, 0, sizeof(struct stat)); rc = stat(sharedfs_rank_kvdir, &s); @@ -533,7 +554,7 @@ static int unifyfs_fskv_init(unifyfs_cfg_t* cfg) if ((rc != 0) && (err != EEXIST)) { LOGERR("failed to create rank kvstore directory %s - %s", sharedfs_rank_kvdir, strerror(err)); - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_KEYVAL; } } have_sharedfs_kvstore = 1; @@ -562,7 +583,7 @@ static int unifyfs_fskv_fini(void) DIR* lkv = opendir(localfs_kvdir); if (NULL == lkv) { LOGERR("failed to opendir(%s)", localfs_kvdir); - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_KEYVAL; } while (NULL != (de = readdir(lkv))) { if ((0 == strcmp(".", de->d_name)) || @@ -570,7 +591,7 @@ static int unifyfs_fskv_fini(void) continue; } memset(kvfile, 0, sizeof(kvfile)); - snprintf(kvfile, sizeof(kvfile), "%s/%s", + scnprintf(kvfile, sizeof(kvfile), "%s/%s", localfs_kvdir, de->d_name); rc = remove(kvfile); if (rc != 0) { @@ -585,7 +606,7 @@ static int unifyfs_fskv_fini(void) if (rc != 0) { LOGERR("failed to remove local kvstore dir %s", sharedfs_rank_kvdir); - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_KEYVAL; } } @@ -599,7 +620,7 @@ static int unifyfs_fskv_fini(void) DIR* rkv = opendir(sharedfs_rank_kvdir); if (NULL == rkv) { LOGERR("failed to opendir(%s)", sharedfs_rank_kvdir); - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_KEYVAL; } while (NULL != (de = readdir(rkv))) { if ((0 == strcmp(".", de->d_name)) || @@ -607,7 +628,7 @@ static int unifyfs_fskv_fini(void) continue; } memset(rank_kvfile, 0, sizeof(rank_kvfile)); - snprintf(rank_kvfile, sizeof(rank_kvfile), "%s/%s", + scnprintf(rank_kvfile, sizeof(rank_kvfile), "%s/%s", sharedfs_rank_kvdir, de->d_name); rc = remove(rank_kvfile); if (rc != 0) { @@ -622,7 +643,7 @@ static int unifyfs_fskv_fini(void) if (rc != 0) { LOGERR("failed to remove rank-specific kvstore dir %s", sharedfs_rank_kvdir); - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_KEYVAL; } } @@ -635,7 +656,7 @@ static int unifyfs_fskv_fini(void) DIR* skv = opendir(sharedfs_kvdir); if (NULL == skv) { LOGERR("failed to opendir(%s)", sharedfs_kvdir); - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_KEYVAL; } while (NULL != (de = readdir(skv))) { if ((0 == strcmp(".", de->d_name)) || @@ -651,7 +672,7 @@ static int unifyfs_fskv_fini(void) if (rc != 0) { LOGERR("failed to remove sharedfs kvstore dir %s", sharedfs_kvdir); - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_KEYVAL; } } } @@ -667,17 +688,22 @@ static int unifyfs_fskv_lookup_local(const char* key, FILE* kvf; char kvfile[UNIFYFS_MAX_FILENAME]; char kvalue[kv_max_vallen]; + int rc; - snprintf(kvfile, sizeof(kvfile), "%s/%s", + scnprintf(kvfile, sizeof(kvfile), "%s/%s", localfs_kvdir, key); kvf = fopen(kvfile, "r"); if (NULL == kvf) { LOGERR("failed to open kvstore entry %s", kvfile); - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_KEYVAL; } memset(kvalue, 0, sizeof(kvalue)); - fscanf(kvf, "%s\n", kvalue); + rc = fscanf(kvf, "%s\n", kvalue); fclose(kvf); + if (rc != 1) { + *oval = NULL; + return (int)UNIFYFS_FAILURE; + } *oval = strdup(kvalue); return (int)UNIFYFS_SUCCESS; @@ -690,22 +716,28 @@ static int unifyfs_fskv_lookup_remote(int rank, FILE* kvf; char 
rank_kvfile[UNIFYFS_MAX_FILENAME]; char kvalue[kv_max_vallen]; + int rc; if (!have_sharedfs_kvstore) { - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_KEYVAL; } - snprintf(rank_kvfile, sizeof(rank_kvfile), "%s/%d/%s", + scnprintf(rank_kvfile, sizeof(rank_kvfile), "%s/%d/%s", sharedfs_kvdir, rank, key); kvf = fopen(rank_kvfile, "r"); if (NULL == kvf) { LOGERR("failed to open kvstore entry %s", rank_kvfile); - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_KEYVAL; } memset(kvalue, 0, sizeof(kvalue)); - fscanf(kvf, "%s\n", kvalue); + rc = fscanf(kvf, "%s\n", kvalue); fclose(kvf); + if (rc != 1) { + *oval = NULL; + return (int)UNIFYFS_FAILURE; + } + *oval = strdup(kvalue); return (int)UNIFYFS_SUCCESS; } @@ -717,12 +749,12 @@ static int unifyfs_fskv_publish_local(const char* key, FILE* kvf; char kvfile[UNIFYFS_MAX_FILENAME]; - snprintf(kvfile, sizeof(kvfile), "%s/%s", + scnprintf(kvfile, sizeof(kvfile), "%s/%s", localfs_kvdir, key); kvf = fopen(kvfile, "w"); if (NULL == kvf) { LOGERR("failed to create kvstore entry %s", kvfile); - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_KEYVAL; } fprintf(kvf, "%s\n", val); fclose(kvf); @@ -737,15 +769,15 @@ static int unifyfs_fskv_publish_remote(const char* key, char rank_kvfile[UNIFYFS_MAX_FILENAME]; if (!have_sharedfs_kvstore) { - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_KEYVAL; } - snprintf(rank_kvfile, sizeof(rank_kvfile), "%s/%s", + scnprintf(rank_kvfile, sizeof(rank_kvfile), "%s/%s", sharedfs_rank_kvdir, key); kvf = fopen(rank_kvfile, "w"); if (NULL == kvf) { LOGERR("failed to create kvstore entry %s", rank_kvfile); - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_KEYVAL; } fprintf(kvf, "%s\n", val); fclose(kvf); @@ -753,6 +785,22 @@ static int unifyfs_fskv_publish_remote(const char* key, return (int)UNIFYFS_SUCCESS; } +static int unifyfs_fskv_fence(void) +{ + if (!have_sharedfs_kvstore) { + return (int)UNIFYFS_ERROR_KEYVAL; + } + + if (1 == kv_nranks) { + return (int)UNIFYFS_SUCCESS; + } + + // TODO - use a file as a counting semaphore?? 
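+    // crude stand-in for a real barrier: assume every server finishes
+    // publishing its key-value files within this fixed delay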
+ sleep(10); + + return (int)UNIFYFS_SUCCESS; +} + //--------------------- K-V Store API --------------------- // Initialize key-value store @@ -836,11 +884,11 @@ int unifyfs_keyval_fini(void) int unifyfs_keyval_lookup_local(const char* key, char** oval) { - int rc = UNIFYFS_FAILURE; + int rc; if ((NULL == key) || (NULL == oval)) { LOGERR("NULL parameter"); - return (int)UNIFYFS_ERROR_INVAL; + return EINVAL; } if (!kv_initialized) { @@ -854,7 +902,7 @@ int unifyfs_keyval_lookup_local(const char* key, if (len >= (kv_max_keylen - 1)) { LOGERR("length of key (%zd) exceeds max %zd", len, kv_max_keylen); - return (int)UNIFYFS_ERROR_INVAL; + return EINVAL; } // do the lookup @@ -870,11 +918,11 @@ int unifyfs_keyval_lookup_remote(int rank, const char* key, char** oval) { - int rc = UNIFYFS_FAILURE; + int rc; if ((NULL == key) || (NULL == oval)) { LOGERR("NULL parameter"); - return (int)UNIFYFS_ERROR_INVAL; + return EINVAL; } if (!kv_initialized) { @@ -889,7 +937,7 @@ int unifyfs_keyval_lookup_remote(int rank, if (len >= (kv_max_keylen - 1)) { LOGERR("length of key (%zd) exceeds max %zd", len, kv_max_keylen); - return (int)UNIFYFS_ERROR_INVAL; + return EINVAL; } // generate full key, which includes remote host @@ -902,6 +950,8 @@ int unifyfs_keyval_lookup_remote(int rank, rc = unifyfs_pmix_lookup(rank_key, oval); #elif defined(USE_PMI2) rc = unifyfs_pmi2_lookup(rank_key, oval); +#else + rc = (int)UNIFYFS_FAILURE; #endif if (rc != (int)UNIFYFS_SUCCESS) { rc = unifyfs_fskv_lookup_remote(rank, key, oval); @@ -916,29 +966,29 @@ int unifyfs_keyval_lookup_remote(int rank, int unifyfs_keyval_publish_local(const char* key, const char* val) { - int rc = UNIFYFS_FAILURE; + int rc; if (!kv_initialized) { - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_KEYVAL; } if ((key == NULL) || (val == NULL)) { LOGERR("NULL key or value"); - return (int)UNIFYFS_ERROR_INVAL; + return EINVAL; } size_t len = strlen(key); if (len >= (kv_max_keylen - 1)) { LOGERR("length of key (%zd) exceeds max %zd", len, kv_max_keylen); - return (int)UNIFYFS_ERROR_INVAL; + return EINVAL; } len = strlen(val); if (len >= kv_max_vallen) { LOGERR("length of val (%zd) exceeds max %zd", len, kv_max_vallen); - return (int)UNIFYFS_ERROR_INVAL; + return EINVAL; } // publish it @@ -955,15 +1005,15 @@ int unifyfs_keyval_publish_local(const char* key, int unifyfs_keyval_publish_remote(const char* key, const char* val) { - int rc = UNIFYFS_FAILURE; + int rc; if (!kv_initialized) { - return (int)UNIFYFS_FAILURE; + return (int)UNIFYFS_ERROR_KEYVAL; } if ((key == NULL) || (val == NULL)) { LOGERR("NULL key or value"); - return (int)UNIFYFS_ERROR_INVAL; + return EINVAL; } // NOTE: assumes rank value fits in 10 characters @@ -971,7 +1021,7 @@ int unifyfs_keyval_publish_remote(const char* key, if (len >= (kv_max_keylen - 1)) { LOGERR("length of key (%zd) exceeds max %zd", len, kv_max_keylen); - return (int)UNIFYFS_ERROR_INVAL; + return EINVAL; } // generate full key, which includes remote host @@ -980,10 +1030,12 @@ int unifyfs_keyval_publish_remote(const char* key, snprintf(rank_key, sizeof(rank_key), "%d.%s", kv_myrank, key); // publish it - #if defined(USE_PMIX) +#if defined(USE_PMIX) rc = unifyfs_pmix_publish(rank_key, val); #elif defined(USE_PMI2) rc = unifyfs_pmi2_publish(rank_key, val); +#else + rc = (int)UNIFYFS_FAILURE; #endif if (rc != (int)UNIFYFS_SUCCESS) { rc = unifyfs_fskv_publish_remote(key, val); @@ -996,3 +1048,17 @@ int unifyfs_keyval_publish_remote(const char* key, } return rc; } + +// block until a particular key-value pair 
published by all servers +int unifyfs_keyval_fence_remote(void) +{ + int rc; +#if defined(USE_PMIX) + rc = unifyfs_pmix_fence(); +#elif defined(USE_PMI2) + rc = unifyfs_pmi2_fence(); +#else + rc = unifyfs_fskv_fence(); +#endif + return rc; +} diff --git a/common/src/unifyfs_keyval.h b/common/src/unifyfs_keyval.h index c98cfedae..d56e884cd 100644 --- a/common/src/unifyfs_keyval.h +++ b/common/src/unifyfs_keyval.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2019, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2019, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -22,11 +22,10 @@ extern "C" { #endif // keys we use -const char* key_runstate; // path to runstate file -const char* key_unifyfsd_socket; // server domain socket path -const char* key_unifyfsd_margo_shm; // client-server margo address -const char* key_unifyfsd_margo_svr; // server-server margo address -const char* key_unifyfsd_mpi_rank; // server-server MPI rank +extern const char* const key_unifyfsd_socket; // server domain socket path +extern const char* const key_unifyfsd_margo_shm; // client-server margo address +extern const char* const key_unifyfsd_margo_svr; // server-server margo address +extern const char* const key_unifyfsd_pmi_rank; // server-server pmi rank // initialize key-value store int unifyfs_keyval_init(unifyfs_cfg_t* cfg, @@ -53,6 +52,9 @@ int unifyfs_keyval_lookup_remote(int rank, const char* key, char** oval); +// block until a particular key-value pair published by all servers +int unifyfs_keyval_fence_remote(void); + #ifdef __cplusplus } // extern "C" #endif diff --git a/common/src/unifyfs_log.c b/common/src/unifyfs_log.c index 294409e36..478593807 100644 --- a/common/src/unifyfs_log.c +++ b/common/src/unifyfs_log.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -29,11 +29,15 @@ #include #include +#include +#include +#include + #include "unifyfs_log.h" #include "unifyfs_const.h" /* one of the loglevel values */ -unifyfs_log_level_t unifyfs_log_level = 5; +unifyfs_log_level_t unifyfs_log_level = LOG_ERR; /* pointer to log file stream */ FILE* unifyfs_log_stream; // = NULL @@ -43,11 +47,12 @@ time_t unifyfs_log_time; struct tm* unifyfs_log_ltime; char unifyfs_log_timestamp[256]; -/* used to reduce source file name length */ +/* used to reduce source file pathname length */ size_t unifyfs_log_source_base_len; // = 0 static const char* this_file = __FILE__; /* open specified file as log file stream, + * or stderr if no file given. 
* returns UNIFYFS_SUCCESS on success */ int unifyfs_log_open(const char* file) { @@ -59,30 +64,55 @@ int unifyfs_log_open(const char* file) } } - FILE* logf = fopen(file, "a"); - if (logf == NULL) { - /* failed to open file name, fall back to stderr */ + if (NULL == unifyfs_log_stream) { + /* stderr is the default log stream */ unifyfs_log_stream = stderr; - return (int)UNIFYFS_ERROR_DBG; - } else { - unifyfs_log_stream = logf; - return UNIFYFS_SUCCESS; } + + if (NULL != file) { + FILE* logf = fopen(file, "a"); + if (logf == NULL) { + return ENOENT; + } else { + unifyfs_log_stream = logf; + } + } + + return (int)UNIFYFS_SUCCESS; } -/* close our log file stream, +/* close our log file stream. * returns UNIFYFS_SUCCESS on success */ int unifyfs_log_close(void) { - if (unifyfs_log_stream == NULL) { - /* nothing to close */ - return (int)UNIFYFS_ERROR_DBG; - } else { - /* if stream is open, and its not stderr, close it */ - if (unifyfs_log_stream != stderr && - fclose(unifyfs_log_stream) == 0) { - return UNIFYFS_SUCCESS; + /* if stream is open, and its not stderr, close it */ + if (NULL != unifyfs_log_stream) { + if (unifyfs_log_stream != stderr) { + fclose(unifyfs_log_stream); + + /* revert to stderr for any future log messages */ + unifyfs_log_stream = stderr; } - return (int)UNIFYFS_ERROR_DBG; } + return (int)UNIFYFS_SUCCESS; +} + +/* set log level */ +void unifyfs_set_log_level(unifyfs_log_level_t lvl) +{ + if (lvl < LOG_LEVEL_MAX) { + unifyfs_log_level = lvl; + } +} + +pid_t unifyfs_gettid(void) +{ +#if defined(gettid) + return gettid(); +#elif defined(SYS_gettid) + return syscall(SYS_gettid); +#else +#error no gettid() +#endif + return 0; } diff --git a/common/src/unifyfs_log.h b/common/src/unifyfs_log.h index 391d0f1f7..ee0160d53 100644 --- a/common/src/unifyfs_log.h +++ b/common/src/unifyfs_log.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -32,9 +32,8 @@ #include #include -#include -#include #include +#include #ifdef __cplusplus extern "C" { @@ -45,7 +44,8 @@ typedef enum { LOG_ERR = 2, LOG_WARN = 3, LOG_INFO = 4, - LOG_DBG = 5 + LOG_DBG = 5, + LOG_LEVEL_MAX } unifyfs_log_level_t; extern unifyfs_log_level_t unifyfs_log_level; @@ -55,13 +55,7 @@ extern struct tm* unifyfs_log_ltime; extern char unifyfs_log_timestamp[256]; extern size_t unifyfs_log_source_base_len; -#if defined(__NR_gettid) -#define gettid() syscall(__NR_gettid) -#elif defined(SYS_gettid) -#define gettid() syscall(SYS_gettid) -#else -#error gettid syscall is not defined -#endif +pid_t unifyfs_gettid(void); #define LOG(level, ...) \ if (level <= unifyfs_log_level) { \ @@ -74,7 +68,7 @@ extern size_t unifyfs_log_source_base_len; unifyfs_log_stream = stderr; \ } \ fprintf(unifyfs_log_stream, "%s tid=%ld @ %s() [%s:%d] ", \ - unifyfs_log_timestamp, (long)gettid(), \ + unifyfs_log_timestamp, (long)unifyfs_gettid(), \ __func__, srcfile, __LINE__); \ fprintf(unifyfs_log_stream, __VA_ARGS__); \ fprintf(unifyfs_log_stream, "\n"); \ @@ -83,6 +77,7 @@ extern size_t unifyfs_log_source_base_len; #define LOGERR(...) LOG(LOG_ERR, __VA_ARGS__) #define LOGWARN(...) LOG(LOG_WARN, __VA_ARGS__) +#define LOGINFO(...) LOG(LOG_INFO, __VA_ARGS__) #define LOGDBG(...) 
LOG(LOG_DBG, __VA_ARGS__) /* open specified file as debug file stream, @@ -93,6 +88,9 @@ int unifyfs_log_open(const char* file); * returns UNIFYFS_SUCCESS on success */ int unifyfs_log_close(void); +/* set log level */ +void unifyfs_set_log_level(unifyfs_log_level_t lvl); + #ifdef __cplusplus } // extern "C" #endif diff --git a/common/src/unifyfs_logio.c b/common/src/unifyfs_logio.c new file mode 100644 index 000000000..648418c91 --- /dev/null +++ b/common/src/unifyfs_logio.c @@ -0,0 +1,848 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "unifyfs_log.h" +#include "unifyfs_logio.h" +#include "unifyfs_meta.h" +#include "unifyfs_shm.h" +#include "slotmap.h" + +#define LOGIO_SHMEM_FMTSTR "logio_mem.%d.%d" +#define LOGIO_SPILL_FMTSTR "%s/logio_spill.%d.%d" + + +/* log-based I/O header - first page of shmem region or spill file */ +typedef struct log_header { + size_t data_sz; /* total data bytes in log */ + size_t reserved_sz; /* reserved data bytes */ + size_t chunk_sz; /* data chunk size */ + size_t max_reserved_slot; /* slot index for last reserved chunk */ + off_t data_offset; /* file/memory offset where data chunks start */ +} log_header; +/* chunk slot_map immediately follows header and occupies rest of the page */ +// slot_map chunk_map; /* chunk slot_map that tracks reservations */ + +static inline +slot_map* log_header_to_chunkmap(log_header* hdr) +{ + char* hdrp = (char*) hdr; + return (slot_map*)(hdrp + sizeof(log_header)); +} + +/* method to get page size once, then re-use it */ +size_t get_page_size(void) +{ + static size_t page_sz; // = 0 + if (0 == page_sz) { + page_sz = (size_t) getpagesize(); + } + return page_sz; +} + +/* calculate number of chunks needed for requested bytes */ +static inline +size_t bytes_to_chunks(size_t bytes, size_t chunk_sz) +{ + size_t n_chunks = bytes / chunk_sz; + if (bytes % chunk_sz) { + n_chunks++; + } + return n_chunks; +} + +/* determine shmem and spill chunk allocations based on log offset */ +static inline +void get_log_sizes(off_t log_offset, + size_t nbytes, + size_t shmem_data_sz, + size_t* sz_in_mem, + size_t* sz_in_spill, + off_t* spill_offset) +{ + assert((NULL != sz_in_mem) && + (NULL != sz_in_spill) && + (NULL != spill_offset)); + + *sz_in_mem = 0; + *sz_in_spill = 0; + *spill_offset = 0; + + if ((log_offset + (off_t)nbytes) <= shmem_data_sz) { + /* data fully in shared memory */ + *sz_in_mem = nbytes; + } else if (log_offset < shmem_data_sz) { + /* requested data spans shared memory and spillover file */ + *sz_in_mem = (size_t)(shmem_data_sz - log_offset); + *sz_in_spill = nbytes - *sz_in_mem; + } else { + /* requested data is totally in spillover file */ + *sz_in_spill = nbytes; + *spill_offset = log_offset - shmem_data_sz; + } +} + +/* open (or create) spill file at path and set its size */ +static int get_spillfile(const char* path, + const size_t spill_sz) +{ + /* try to create the spill file */ + mode_t perms = unifyfs_getmode(0640); + int spill_fd = open(path, O_RDWR | O_CREAT | O_EXCL, perms); + if (spill_fd < 0) { + if (errno == EEXIST) { + /* already exists - try simple open */ + 
spill_fd = open(path, O_RDWR); + } else { + int err = errno; + LOGERR("open(%s) failed: %s", path, strerror(err)); + } + } else { + /* new spillover block created, set its size */ + int rc = ftruncate(spill_fd, (off_t)spill_sz); + if (rc < 0) { + int err = errno; + LOGERR("ftruncate() failed: %s", strerror(err)); + } + } + return spill_fd; +} + +/* map log header (1st page) of spill file given by file descriptor */ +static void* map_spillfile(int spill_fd, int mmap_prot) +{ + size_t pgsz = get_page_size(); + void* addr = mmap(NULL, pgsz, mmap_prot, MAP_SHARED, spill_fd, 0); + if (NULL == addr) { + int err = errno; + LOGERR("mmap(fd=%d, sz=%zu, MAP_SHARED) failed - %s", + spill_fd, pgsz, strerror(err)); + } + return addr; +} + +/* Initialize logio context for server */ +int unifyfs_logio_init_server(const int app_id, + const int client_id, + const size_t mem_size, + const size_t spill_size, + const char* spill_dir, + logio_context** pctx) +{ + if (NULL == pctx) { + return EINVAL; + } + *pctx = NULL; + + shm_context* shm_ctx = NULL; + if (mem_size) { + /* attach to client shmem region */ + char shm_name[SHMEM_NAME_LEN] = {0}; + snprintf(shm_name, sizeof(shm_name), LOGIO_SHMEM_FMTSTR, + app_id, client_id); + shm_ctx = unifyfs_shm_alloc(shm_name, mem_size); + if (NULL == shm_ctx) { + LOGERR("Failed to attach logio shmem buffer!"); + return UNIFYFS_ERROR_SHMEM; + } + } + + char spillfile[UNIFYFS_MAX_FILENAME]; + void* spill_mapping = NULL; + int spill_fd = -1; + if (spill_size) { + if (NULL == spill_dir) { + LOGERR("Spill directory not given!"); + return EINVAL; + } + + /* open the spill over file */ + snprintf(spillfile, sizeof(spillfile), LOGIO_SPILL_FMTSTR, + spill_dir, app_id, client_id); + spill_fd = get_spillfile(spillfile, spill_size); + if (spill_fd < 0) { + LOGERR("Failed to open logio spill file!"); + return UNIFYFS_FAILURE; + } else { + /* map first page of the spill over file, which contains log header + * and chunk slot_map. server only needs read access. 
*/ + spill_mapping = map_spillfile(spill_fd, PROT_READ); + if (NULL == spill_mapping) { + LOGERR("Failed to map logio spill file header!"); + return UNIFYFS_FAILURE; + } + } + } + + logio_context* ctx = (logio_context*) calloc(1, sizeof(logio_context)); + if (NULL == ctx) { + LOGERR("Failed to allocate logio context!"); + return ENOMEM; + } + ctx->shmem = shm_ctx; + ctx->spill_hdr = spill_mapping; + ctx->spill_fd = spill_fd; + ctx->spill_sz = spill_size; + if (spill_size) { + ctx->spill_file = strdup(spillfile); + } + *pctx = ctx; + + return UNIFYFS_SUCCESS; +} + + +/* initialize the log header page for given log region and size + * (note: intended for client use only) */ +static int init_log_header(char* log_region, + size_t region_size, + size_t chunk_size) +{ + size_t pgsz = get_page_size(); + + /* TODO: need to think about how to support client re-attach */ + + /* log header structure resides at start of log region */ + log_header* hdr = (log_header*) log_region; + + /* zero all log header fields */ + memset(log_region, 0, sizeof(log_header)); + + /* chunk data starts after header page */ + size_t data_size = region_size - pgsz; + hdr->data_sz = data_size; + hdr->chunk_sz = chunk_size; + hdr->data_offset = (off_t)pgsz; + + /* initialize chunk slot map (immediately follows header in memory) */ + char* slotmap = log_region + sizeof(log_header); + size_t slotmap_size = pgsz - sizeof(log_header); + size_t n_chunks = data_size / chunk_size; + slot_map* chunkmap = slotmap_init(n_chunks, (void*)slotmap, slotmap_size); + if (NULL == chunkmap) { + LOGERR("Failed to initialize chunk slotmap @ %p (sz=%zu, #chunks=%zu)", + slotmap, slotmap_size, n_chunks); + return UNIFYFS_FAILURE; + } + + return UNIFYFS_SUCCESS; +} + +/* Initialize logio for client */ +int unifyfs_logio_init_client(const int app_id, + const int client_id, + const unifyfs_cfg_t* client_cfg, + logio_context** pctx) +{ + char* cfgval; + int rc; + + if ((NULL == client_cfg) || (NULL == pctx)) { + return EINVAL; + } + *pctx = NULL; + + /* determine max memory bytes for chunk storage */ + size_t memlog_size = 0; + cfgval = client_cfg->logio_shmem_size; + if (cfgval != NULL) { + long l; + rc = configurator_int_val(cfgval, &l); + if (rc == 0) { + memlog_size = (size_t)l; + } + } + + /* get chunk size from config */ + size_t chunk_size = UNIFYFS_LOGIO_CHUNK_SIZE; + cfgval = client_cfg->logio_chunk_size; + if (cfgval != NULL) { + long l; + rc = configurator_int_val(cfgval, &l); + if (rc == 0) { + chunk_size = (size_t)l; + } + } + + shm_context* shm_ctx = NULL; + if (memlog_size) { + /* allocate logio shared memory buffer */ + char shm_name[SHMEM_NAME_LEN] = {0}; + snprintf(shm_name, sizeof(shm_name), LOGIO_SHMEM_FMTSTR, + app_id, client_id); + shm_ctx = unifyfs_shm_alloc(shm_name, memlog_size); + if (NULL == shm_ctx) { + LOGERR("Failed to create logio shmem buffer!"); + return UNIFYFS_ERROR_SHMEM; + } + + /* initialize shmem log header */ + char* memlog = (char*) shm_ctx->addr; + rc = init_log_header(memlog, memlog_size, chunk_size); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("Failed to initialize shmem logio header"); + return rc; + } + } + + /* will we use spillover to store the files? 
*/ + size_t spill_size = 0; + cfgval = client_cfg->logio_spill_size; + if (cfgval != NULL) { + long l; + rc = configurator_int_val(cfgval, &l); + if (rc == 0) { + spill_size = (size_t)l; + } + } + int unifyfs_use_spillover = 0; + if (spill_size > 0) { + LOGDBG("using spillover - size = %zu B", spill_size); + unifyfs_use_spillover = 1; + } + + void* spill_mapping = NULL; + int spill_fd = -1; + if (unifyfs_use_spillover) { + /* get directory in which to create spill over files */ + cfgval = client_cfg->logio_spill_dir; + if (NULL == cfgval) { + LOGERR("UNIFYFS_LOGIO_SPILL_DIR configuration not set! " + "Set to an existing writable path (e.g., /mnt/ssd)"); + return UNIFYFS_ERROR_BADCONFIG; + } + + /* define path to the spill over file for data chunks */ + char spillfile[UNIFYFS_MAX_FILENAME]; + snprintf(spillfile, sizeof(spillfile), LOGIO_SPILL_FMTSTR, + cfgval, app_id, client_id); + + /* create the spill over file */ + spill_fd = get_spillfile(spillfile, spill_size); + if (spill_fd < 0) { + LOGERR("Failed to open logio spill file!"); + return UNIFYFS_FAILURE; + } else { + /* map first page of the spill over file, which contains log header + * and chunk slot_map. client needs read and write access. */ + spill_mapping = map_spillfile(spill_fd, PROT_READ|PROT_WRITE); + if (NULL == spill_mapping) { + LOGERR("Failed to map logio spill file header!"); + return UNIFYFS_FAILURE; + } + + /* initialize spill log header */ + char* spill = (char*) spill_mapping; + rc = init_log_header(spill, spill_size, chunk_size); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("Failed to initialize shmem logio header"); + return rc; + } + } + } + + logio_context* ctx = (logio_context*) calloc(1, sizeof(logio_context)); + if (NULL == ctx) { + LOGERR("Failed to allocate logio context!"); + return ENOMEM; + } + ctx->shmem = shm_ctx; + ctx->spill_hdr = spill_mapping; + ctx->spill_fd = spill_fd; + ctx->spill_sz = spill_size; + *pctx = ctx; + + return UNIFYFS_SUCCESS; +} + +/* Close logio context */ +int unifyfs_logio_close(logio_context* ctx, + int clean_spill) +{ + if (NULL == ctx) { + return EINVAL; + } + + int rc; + if (NULL != ctx->shmem) { + /* release shmem region */ + rc = unifyfs_shm_free(&(ctx->shmem)); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("Failed to release logio shmem region!"); + } + } + + if (ctx->spill_sz) { + if (NULL != ctx->spill_hdr) { + /* unmap log header page */ + rc = munmap(ctx->spill_hdr, get_page_size()); + if (rc != 0) { + int err = errno; + LOGERR("Failed to unmap logio spill file header (errno=%s)", + strerror(err)); + } + ctx->spill_hdr = NULL; + } + if (-1 != ctx->spill_fd) { + /* close spill file */ + rc = close(ctx->spill_fd); + if (rc != 0) { + int err = errno; + LOGERR("Failed to close logio spill file (errno=%s)", + strerror(err)); + } + ctx->spill_fd = -1; + } + if (clean_spill && (ctx->spill_file != NULL)) { + rc = unlink(ctx->spill_file); + if (rc != 0) { + int err = errno; + LOGERR("Failed to unlink logio spill file %s (errno=%s)", + ctx->spill_file, strerror(err)); + } + free(ctx->spill_file); + } + } + + /* free the context struct */ + free(ctx); + + return UNIFYFS_SUCCESS; +} + +/* Allocate write space from logio context */ +int unifyfs_logio_alloc(logio_context* ctx, + const size_t nbytes, + off_t* log_offset) +{ + if ((NULL == ctx) || + ((nbytes > 0) && (NULL == log_offset))) { + return EINVAL; + } + + if (0 == nbytes) { + LOGWARN("zero bytes allocated from log!"); + return UNIFYFS_SUCCESS; + } + + size_t chunk_sz = 0; + size_t allocated_bytes = 0; + size_t needed_bytes = nbytes; + 
size_t needed_chunks; + size_t res_chunks; + ssize_t res_slot; + off_t res_off = -1; + + size_t mem_res_slot = 0; + size_t mem_res_nchk = 0; + int mem_res_at_end = 0; + size_t mem_allocation = 0; + + log_header* shmem_hdr = NULL; + log_header* spill_hdr = NULL; + slot_map* chunkmap; + + if (NULL != ctx->shmem) { + /* get shmem log header and chunk slotmap */ + shmem_hdr = (log_header*) ctx->shmem->addr; + chunkmap = log_header_to_chunkmap(shmem_hdr); + + /* calculate number of chunks needed for requested bytes */ + chunk_sz = shmem_hdr->chunk_sz; + needed_chunks = bytes_to_chunks(needed_bytes, chunk_sz); + + /* try to reserve all chunks from shmem */ + res_chunks = needed_chunks; + res_slot = slotmap_reserve(chunkmap, res_chunks); + if (-1 != res_slot) { + /* success, all needed chunks allocated in shmem */ + allocated_bytes = res_chunks * chunk_sz; + shmem_hdr->reserved_sz += allocated_bytes; + shmem_hdr->max_reserved_slot = (res_slot + res_chunks) - 1; + res_off = (off_t)(res_slot * chunk_sz); + *log_offset = res_off; + return UNIFYFS_SUCCESS; + } + + /* could not get full allocation in shmem, reserve any available + * chunks at the end of the shmem log */ + size_t log_end_chunks = chunkmap->total_slots - + (shmem_hdr->max_reserved_slot + 1); + if (log_end_chunks > 0) { + res_chunks = log_end_chunks; + res_slot = slotmap_reserve(chunkmap, res_chunks); + if (-1 != res_slot) { + /* reserved all chunks at end of shmem log */ + allocated_bytes = res_chunks * chunk_sz; + needed_bytes -= allocated_bytes; + res_off = (off_t)(res_slot * chunk_sz); + mem_allocation = allocated_bytes; + mem_res_slot = res_slot; + mem_res_nchk = res_chunks; + mem_res_at_end = 1; + } + } + } + + if (NULL != ctx->spill_hdr) { + /* get spill log header and chunk slotmap */ + spill_hdr = (log_header*) ctx->spill_hdr; + chunkmap = log_header_to_chunkmap(spill_hdr); + + /* calculate number of chunks needed for remaining bytes */ + chunk_sz = spill_hdr->chunk_sz; + needed_chunks = bytes_to_chunks(needed_bytes, chunk_sz); + + /* reserve the rest of the chunks from spill file */ + res_chunks = needed_chunks; + res_slot = slotmap_reserve(chunkmap, res_chunks); + if (-1 != res_slot) { + allocated_bytes = res_chunks * chunk_sz; + if (0 == mem_res_at_end) { + /* success, full reservation in spill */ + spill_hdr->reserved_sz += allocated_bytes; + spill_hdr->max_reserved_slot = (res_slot + res_chunks) - 1; + res_off = (off_t)(res_slot * chunk_sz); + if (NULL != shmem_hdr) { + /* update log offset to account for shmem log size */ + res_off += shmem_hdr->data_sz; + } + *log_offset = res_off; + return UNIFYFS_SUCCESS; + } else { + /* if we have an allocation from end of shmem log, make sure + * spill allocation starts at first chunk (slot=0) */ + if (res_slot != 0) { + /* incompatible shmem and spill reservations, release both + * and try to get the full allocation from spill */ + + /* release the spill chunks we just got */ + int rc = slotmap_release(chunkmap, res_slot, res_chunks); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("slotmap_release() for logio shmem failed"); + } + + /* release the shmem chunks */ + chunkmap = log_header_to_chunkmap(shmem_hdr); + rc = slotmap_release(chunkmap, mem_res_slot, mem_res_nchk); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("slotmap_release() for logio shmem failed"); + } + mem_res_slot = 0; + mem_res_nchk = 0; + mem_allocation = 0; + + /* try again with full reservation in spill */ + chunkmap = log_header_to_chunkmap(spill_hdr); + needed_chunks = bytes_to_chunks(nbytes, chunk_sz); + res_chunks = 
needed_chunks; + res_slot = slotmap_reserve(chunkmap, res_chunks); + if (-1 != res_slot) { + /* success, full reservation in spill */ + spill_hdr->reserved_sz += allocated_bytes; + spill_hdr->max_reserved_slot = + (res_slot + res_chunks) - 1; + res_off = (off_t)(res_slot * chunk_sz); + if (NULL != shmem_hdr) { + /* update log offset to include shmem log size */ + res_off += shmem_hdr->data_sz; + } + *log_offset = res_off; + return UNIFYFS_SUCCESS; + } + } else { + /* successful reservation spanning shmem and spill */ + shmem_hdr->reserved_sz += mem_allocation; + shmem_hdr->max_reserved_slot = + (mem_res_slot + mem_res_nchk) - 1; + spill_hdr->reserved_sz += allocated_bytes; + spill_hdr->max_reserved_slot = (res_slot + res_chunks) - 1; + *log_offset = res_off; + return UNIFYFS_SUCCESS; + } + } + } + } + + /* can't fulfill request from spill file, roll back any prior + * shmem reservation and return ENOSPC */ + if (mem_res_nchk) { + chunkmap = log_header_to_chunkmap(shmem_hdr); + int rc = slotmap_release(chunkmap, mem_res_slot, mem_res_nchk); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("slotmap_release() for logio shmem failed"); + } + } + return ENOSPC; +} + +/* Release previously allocated write space from logio context */ +int unifyfs_logio_free(logio_context* ctx, + const off_t log_offset, + const size_t nbytes) +{ + if (NULL == ctx) { + return EINVAL; + } + + if (0 == nbytes) { + LOGWARN("zero bytes freed from log!"); + return UNIFYFS_SUCCESS; + } + + log_header* shmem_hdr = NULL; + log_header* spill_hdr = NULL; + slot_map* chunkmap; + + off_t mem_size = 0; + if (NULL != ctx->shmem) { + shmem_hdr = (log_header*) ctx->shmem->addr; + mem_size = (off_t) shmem_hdr->data_sz; + } + + /* determine chunk allocations based on log offset */ + size_t sz_in_mem = 0; + size_t sz_in_spill = 0; + off_t spill_offset = 0; + get_log_sizes(log_offset, nbytes, mem_size, + &sz_in_mem, &sz_in_spill, &spill_offset); + + int rc = UNIFYFS_SUCCESS; + size_t chunk_sz, chunk_slot, num_chunks; + if (sz_in_mem > 0) { + /* release shared memory chunks */ + chunk_sz = shmem_hdr->chunk_sz; + chunk_slot = log_offset / chunk_sz; + num_chunks = bytes_to_chunks(sz_in_mem, chunk_sz); + chunkmap = log_header_to_chunkmap(shmem_hdr); + rc = slotmap_release(chunkmap, chunk_slot, num_chunks); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("slotmap_release() for logio shmem failed"); + } + } + if (sz_in_spill > 0) { + /* release spill chunks */ + spill_hdr = (log_header*) ctx->spill_hdr; + chunk_sz = spill_hdr->chunk_sz; + chunk_slot = spill_offset / chunk_sz; + num_chunks = bytes_to_chunks(sz_in_spill, chunk_sz); + chunkmap = log_header_to_chunkmap(spill_hdr); + rc = slotmap_release(chunkmap, chunk_slot, num_chunks); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("slotmap_release() for logio spill failed"); + } + } + return rc; +} + +/* Read data from logio context */ +int unifyfs_logio_read(logio_context* ctx, + const off_t log_offset, + const size_t nbytes, + char* obuf, + size_t* obytes) +{ + if ((NULL == ctx) || + ((nbytes > 0) && (NULL == obuf))) { + return EINVAL; + } + + if (NULL != obytes) { + *obytes = 0; + } + + if (0 == nbytes) { + LOGWARN("zero bytes read from log!"); + return UNIFYFS_SUCCESS; + } + + log_header* shmem_hdr = NULL; + off_t mem_size = 0; + if (NULL != ctx->shmem) { + shmem_hdr = (log_header*) ctx->shmem->addr; + mem_size = (off_t) shmem_hdr->data_sz; + } + + /* prepare read operations based on log offset */ + size_t nread = 0; + size_t sz_in_mem = 0; + size_t sz_in_spill = 0; + off_t spill_offset = 0; + 
get_log_sizes(log_offset, nbytes, mem_size, + &sz_in_mem, &sz_in_spill, &spill_offset); + + /* do reads */ + int err_rc = 0; + if (sz_in_mem > 0) { + /* read data from shared memory */ + char* shmem_data = (char*)(ctx->shmem->addr) + shmem_hdr->data_offset; + char* log_ptr = shmem_data + log_offset; + memcpy(obuf, log_ptr, sz_in_mem); + nread += sz_in_mem; + } + if (sz_in_spill > 0) { + log_header* spill_hdr = (log_header*) ctx->spill_hdr; + spill_offset += spill_hdr->data_offset; + + /* read data from spillover file */ + ssize_t rc = pread(ctx->spill_fd, (obuf + sz_in_mem), + sz_in_spill, spill_offset); + if (-1 == rc) { + err_rc = errno; + LOGERR("pread(spillfile) failed: %s", strerror(err_rc)); + } else { + nread += rc; + } + } + + if (nread) { + if (nread != nbytes) { + LOGDBG("partial log read: %zu of %zu bytes", nread, nbytes); + } + if (NULL != obytes) { + *obytes = nread; + } + return UNIFYFS_SUCCESS; + } else { + return err_rc; + } +} + +/* Write data to logio context */ +int unifyfs_logio_write(logio_context* ctx, + const off_t log_offset, + const size_t nbytes, + const char* ibuf, + size_t* obytes) +{ + if ((NULL == ctx) || + ((nbytes > 0) && (NULL == ibuf))) { + return EINVAL; + } + + if (NULL != obytes) { + *obytes = 0; + } + + if (0 == nbytes) { + LOGWARN("zero bytes written to log!"); + return UNIFYFS_SUCCESS; + } + + log_header* shmem_hdr = NULL; + off_t mem_size = 0; + if (NULL != ctx->shmem) { + shmem_hdr = (log_header*) ctx->shmem->addr; + mem_size = (off_t) shmem_hdr->data_sz; + } + + /* prepare write operations based on log offset */ + size_t nwrite = 0; + size_t sz_in_mem = 0; + size_t sz_in_spill = 0; + off_t spill_offset = 0; + get_log_sizes(log_offset, nbytes, mem_size, + &sz_in_mem, &sz_in_spill, &spill_offset); + + /* do writes */ + int err_rc = 0; + if (sz_in_mem > 0) { + /* write data to shared memory */ + char* shmem_data = (char*)(ctx->shmem->addr) + shmem_hdr->data_offset; + char* log_ptr = shmem_data + log_offset; + memcpy(log_ptr, ibuf, sz_in_mem); + nwrite += sz_in_mem; + } + if (sz_in_spill > 0) { + log_header* spill_hdr = (log_header*) ctx->spill_hdr; + spill_offset += spill_hdr->data_offset; + + /* write data to spillover file */ + ssize_t rc = pwrite(ctx->spill_fd, (ibuf + sz_in_mem), + sz_in_spill, spill_offset); + if (-1 == rc) { + err_rc = errno; + LOGERR("pwrite(spillfile) failed: %s", strerror(err_rc)); + } else { + nwrite += rc; + } + } + + /* update output parameter if we wrote anything */ + if (nwrite) { + if (nwrite != nbytes) { + LOGDBG("partial log write: %zu of %zu bytes", nwrite, nbytes); + } + + if (NULL != obytes) { + /* obytes is set to the number of bytes actually written */ + *obytes = nwrite; + } + return UNIFYFS_SUCCESS; + } else { + return err_rc; + } +} + +/* Sync any spill data to disk for given logio context */ +int unifyfs_logio_sync(logio_context* ctx) +{ + if ((ctx->spill_sz) && (-1 != ctx->spill_fd)) { + /* fsync spill file */ + int rc = fsync(ctx->spill_fd); + if (rc != 0) { + int err = errno; + LOGERR("Failed to fsync logio spill file (errno=%s)", + strerror(err)); + return err; + } + } + return UNIFYFS_SUCCESS; +} + +/* Get the shmem and spill data sizes */ +int unifyfs_logio_get_sizes(logio_context* ctx, + off_t* shmem_sz, + off_t* spill_sz) +{ + if (NULL == ctx) { + return EINVAL; + } + + if (NULL != shmem_sz) { + *shmem_sz = 0; + if (NULL != ctx->shmem) { + log_header* shmem_hdr = (log_header*) ctx->shmem->addr; + *shmem_sz = (off_t) shmem_hdr->data_sz; + } + } + + if (NULL != spill_sz) { + *spill_sz = 0; + if (NULL 
!= ctx->spill_hdr) { + log_header* spill_hdr = (log_header*) ctx->spill_hdr; + *spill_sz = (off_t) spill_hdr->data_sz; + } + } + + return UNIFYFS_SUCCESS; +} diff --git a/common/src/unifyfs_logio.h b/common/src/unifyfs_logio.h new file mode 100644 index 000000000..15364e3df --- /dev/null +++ b/common/src/unifyfs_logio.h @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#ifndef UNIFYFS_LOGIO_H +#define UNIFYFS_LOGIO_H + +#include + +#include "unifyfs_configurator.h" +#include "unifyfs_shm.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* log-based I/O context structure */ +typedef struct logio_context { + shm_context* shmem; /* shmem region for memory storage */ + void* spill_hdr; /* mmap() address for spillover file log header */ + char* spill_file; /* pathname of spillover file */ + size_t spill_sz; /* size of spillover file */ + int spill_fd; /* spillover file descriptor */ +} logio_context; + +/** + * Initialize logio context for server. + * + * @param app_id application id + * @param client_id client id + * @param mem_sz shared memory region size for storing data + * @param spill_sz spillfile size for storing data + * @param spill_dir path to spillfile parent directory + * @param[out] ctx address of logio context pointer, set to new context + * @return UNIFYFS_SUCCESS, or error code + */ +int unifyfs_logio_init_server(const int app_id, + const int client_id, + const size_t mem_sz, + const size_t spill_sz, + const char* spill_dir, + logio_context** ctx); + +/** + * Initialize logio context for client. + * + * @param app_id application id + * @param client_id client id + * @param client_cfg pointer to client configuration + * @param[out] ctx address of logio context pointer, set to new context + * @return UNIFYFS_SUCCESS, or error code + */ +int unifyfs_logio_init_client(const int app_id, + const int client_id, + const unifyfs_cfg_t* client_cfg, + logio_context** ctx); + +/** + * Close logio context. + * + * @param ctx pointer to logio context + * @param clean_spill set to non-zero to have server remove spill file + * @return UNIFYFS_SUCCESS, or error code + */ +int unifyfs_logio_close(logio_context* ctx, + int clean_spill); + +/** + * Allocate write space from logio context. + * + * @param ctx pointer to logio context + * @param nbytes size of allocation in bytes + * @param[out] log_offset set to log offset to use for writing + * @return UNIFYFS_SUCCESS, or error code + */ +int unifyfs_logio_alloc(logio_context* ctx, + const size_t nbytes, + off_t* log_offset); + +/** + * Release previously allocated write space from logio context. + * + * @param ctx pointer to logio context + * @param log_offset log offset of allocation to release + * @param nbytes size of allocation in bytes + * @return UNIFYFS_SUCCESS, or error code + */ +int unifyfs_logio_free(logio_context* ctx, + const off_t log_offset, + const size_t nbytes); + +/** + * Read data from logio context at given log offset. 
+ * + * @param ctx pointer to logio context + * @param log_offset log offset to read from + * @param nbytes number of bytes to read + * @param buf destination data buffer + * @param[out] obytes set to number of bytes actually read + * @return UNIFYFS_SUCCESS, or error code + */ +int unifyfs_logio_read(logio_context* ctx, + const off_t log_offset, + const size_t nbytes, + char* buf, + size_t* obytes); + +/** + * Write data to logio context at given log offset. + * + * @param ctx pointer to logio context + * @param log_offset log offset to write to + * @param nbytes number of bytes to write + * @param buf source data buffer + * @param[out] obytes set to number of bytes actually written + * @return UNIFYFS_SUCCESS, or error code + */ +int unifyfs_logio_write(logio_context* ctx, + const off_t log_offset, + const size_t nbytes, + const char* buf, + size_t* obytes); + +/** + * Sync any spill data to disk for given logio context. + * + * @param ctx pointer to logio context + * @return UNIFYFS_SUCCESS, or error code + */ +int unifyfs_logio_sync(logio_context* ctx); + +/** + * Get the shmem and spill data sizes. + * + * @param ctx pointer to logio context + * @param[out] shmem_sz if non-NULL, set to size of shmem data storage + * @param[out] spill_sz if non-NULL, set to size of spillover data storage + * @return UNIFYFS_SUCCESS, or error code + */ +int unifyfs_logio_get_sizes(logio_context* ctx, + off_t* shmem_sz, + off_t* spill_sz); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif /* UNIFYFS_LOGIO_H */ diff --git a/common/src/unifyfs_meta.c b/common/src/unifyfs_meta.c new file mode 100644 index 000000000..d1fa5adcf --- /dev/null +++ b/common/src/unifyfs_meta.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#include +#include +#include + +#include "unifyfs_meta.h" + +/* extent slice size used for metadata */ +size_t meta_slice_sz = META_DEFAULT_RANGE_SZ; + +/* calculate number of slices in an extent given by start offset and length */ +size_t meta_num_slices(size_t offset, size_t length) +{ + size_t start = offset / meta_slice_sz; + size_t end = (offset + length - 1) / meta_slice_sz; + size_t count = end - start + 1; + return count; +} + +/** + * Hash a file path to a 64-bit unsigned integer using MD5 + * @param path absolute file path + * @return hash value + */ +uint64_t compute_path_md5(const char* path) +{ + unsigned long len; + unsigned char digested[16] = {0}; + + len = strlen(path); + MD5((const unsigned char*) path, len, digested); + + /* construct uint64_t hash from first 8 digest bytes */ + uint64_t hash = be64toh(*((uint64_t*)digested)); + return hash; +} diff --git a/common/src/unifyfs_meta.h b/common/src/unifyfs_meta.h index ea5a159b6..24d5dc460 100644 --- a/common/src/unifyfs_meta.h +++ b/common/src/unifyfs_meta.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2018, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2018, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. 
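The reserve/write/sync/read/free cycle implemented by the log-based I/O routines above is easiest to see end to end. The sketch below is illustrative only and is not taken from the patch: it assumes a `logio_context` already obtained from `unifyfs_logio_init_client()` or `unifyfs_logio_init_server()`, and the helper name `logio_roundtrip` is hypothetical. Only functions declared in `unifyfs_logio.h` above are used.

.. code-block:: C

    #include <errno.h>
    #include <stdio.h>

    #include "unifyfs_logio.h"
    #include "unifyfs_rc.h"

    /* Push one payload through the log: reserve space, write it, sync any
     * spillover data, read it back, then release the reservation.
     * 'ctx' is assumed to come from unifyfs_logio_init_client/server(). */
    int logio_roundtrip(logio_context* ctx, const char* payload, size_t len)
    {
        char readback[256];
        if ((NULL == ctx) || (len > sizeof(readback))) {
            return EINVAL;
        }

        /* reserve 'len' bytes; may land in shmem, spill, or span both */
        off_t log_off;
        int rc = unifyfs_logio_alloc(ctx, len, &log_off);
        if (rc != UNIFYFS_SUCCESS) {
            return rc; /* e.g., ENOSPC when shmem and spill are exhausted */
        }

        /* write the payload at the reserved log offset */
        size_t nwritten = 0;
        rc = unifyfs_logio_write(ctx, log_off, len, payload, &nwritten);
        if ((rc != UNIFYFS_SUCCESS) || (nwritten != len)) {
            unifyfs_logio_free(ctx, log_off, len);
            return (rc != UNIFYFS_SUCCESS) ? rc : UNIFYFS_FAILURE;
        }

        /* flush spillover data to disk before reading it back */
        rc = unifyfs_logio_sync(ctx);
        if (rc == UNIFYFS_SUCCESS) {
            /* read the payload back from the same log offset */
            size_t nread = 0;
            rc = unifyfs_logio_read(ctx, log_off, len, readback, &nread);
            if (rc == UNIFYFS_SUCCESS) {
                printf("read back %zu of %zu bytes\n", nread, len);
            }
        }

        /* release the chunks backing this reservation */
        unifyfs_logio_free(ctx, log_off, len);
        return rc;
    }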
@@ -15,36 +15,58 @@ #ifndef UNIFYFS_META_H #define UNIFYFS_META_H +#include +#include #include +#include +#include +#include #include #include -#include #include #include "unifyfs_const.h" +#include "unifyfs_log.h" #ifdef __cplusplus extern "C" { #endif -/** - * Server commands - */ -typedef enum { - COMM_MOUNT, - COMM_META_FSYNC, - COMM_META_GET, - COMM_META_SET, - COMM_READ, - COMM_UNMOUNT, - COMM_DIGEST, - COMM_SYNC_DEL, -} cmd_lst_t; +/* extent slice size used for metadata */ +extern size_t meta_slice_sz; + +/* calculate number of slices in an extent given by start offset and length */ +size_t meta_num_slices(size_t offset, size_t length); + +/* structure used to detect clients/servers colocated on a host */ +typedef struct { + char hostname[UNIFYFS_MAX_HOSTNAME]; + int rank; +} name_rank_pair_t; + +/* generic file extent */ +typedef struct { + size_t offset; + size_t length; + int gfid; +} unifyfs_extent_t; + +/* write-log metadata index structure */ +typedef struct { + off_t file_pos; /* start offset of data in file */ + off_t log_pos; /* start offset of data in write log */ + size_t length; /* length of data */ + int gfid; /* global file id */ +} unifyfs_index_t; + +/* UnifyFS file attributes */ typedef struct { - int fid; + char* filename; int gfid; - char filename[UNIFYFS_MAX_FILENAME]; + + /* Set when the file is laminated */ + int is_laminated; /* essential stat fields */ uint32_t mode; /* st_mode bits */ @@ -54,9 +76,6 @@ typedef struct { struct timespec atime; struct timespec mtime; struct timespec ctime; - - /* Set when the file is laminated */ - uint32_t is_laminated; } unifyfs_file_attr_t; enum { @@ -66,10 +85,151 @@ enum { UNIFYFS_STAT_DEFAULT_DIR_MODE = S_IFDIR | 0755, }; +static inline +int unifyfs_file_attr_set_invalid(unifyfs_file_attr_t* attr) +{ + if (!attr) { + return EINVAL; + } + + memset(attr, 0, sizeof(*attr)); + attr->filename = NULL; + attr->gfid = -1; + attr->is_laminated = -1; + attr->mode = -1; + attr->uid = -1; + attr->gid = -1; + attr->size = (uint64_t) -1; + + return 0; +} + +static inline +void debug_print_file_attr(unifyfs_file_attr_t* attr) +{ + if (!attr) { + return; + } + LOGDBG("fileattr(%p) - gfid=%d filename=%s laminated=%d", + attr, attr->gfid, attr->filename, attr->is_laminated); + LOGDBG(" - sz=%zu mode=%o uid=%d gid=%d", + (size_t)attr->size, attr->mode, attr->uid, attr->gid); + LOGDBG(" - atime=%ld.%09ld ctime=%ld.%09ld mtime=%ld.%09ld", + attr->atime.tv_sec, attr->atime.tv_nsec, + attr->ctime.tv_sec, attr->ctime.tv_nsec, + attr->mtime.tv_sec, attr->mtime.tv_nsec); +} + +typedef enum { + UNIFYFS_FILE_ATTR_OP_INVALID = 0, + UNIFYFS_FILE_ATTR_OP_CHGRP, + UNIFYFS_FILE_ATTR_OP_CHMOD, + UNIFYFS_FILE_ATTR_OP_CHOWN, + UNIFYFS_FILE_ATTR_OP_CREATE, + UNIFYFS_FILE_ATTR_OP_DATA, + UNIFYFS_FILE_ATTR_OP_LAMINATE, + UNIFYFS_FILE_ATTR_OP_TRUNCATE +} unifyfs_file_attr_op_e; + +/* + * updates @dst with new values from @src. + * ignores fields from @src with negative values. + */ +static inline +int unifyfs_file_attr_update(int attr_op, + unifyfs_file_attr_t* dst, + unifyfs_file_attr_t* src) +{ + if (!dst || !src + || (attr_op == UNIFYFS_FILE_ATTR_OP_INVALID) + || (dst->gfid != src->gfid)) { + return EINVAL; + } + + LOGDBG("updating attributes for gfid=%d", dst->gfid); + + /* Update fields only with valid values and associated operation. 
+ * invalid values are set by unifyfs_file_attr_set_invalid() above */ + + if ((src->mode != -1) && + ((attr_op == UNIFYFS_FILE_ATTR_OP_CHMOD) || + (attr_op == UNIFYFS_FILE_ATTR_OP_CREATE) || + (attr_op == UNIFYFS_FILE_ATTR_OP_LAMINATE))) { + LOGDBG("setting mode to %o", src->mode); + dst->mode = src->mode; + } + + if ((src->uid != -1) && + ((attr_op == UNIFYFS_FILE_ATTR_OP_CHOWN) || + (attr_op == UNIFYFS_FILE_ATTR_OP_CREATE))) { + dst->uid = src->uid; + } + + if ((src->gid != -1) && + ((attr_op == UNIFYFS_FILE_ATTR_OP_CHGRP) || + (attr_op == UNIFYFS_FILE_ATTR_OP_CREATE))) { + dst->gid = src->gid; + } + + if ((src->size != (uint64_t)-1) && + ((attr_op == UNIFYFS_FILE_ATTR_OP_CREATE) || + (attr_op == UNIFYFS_FILE_ATTR_OP_DATA) || + (attr_op == UNIFYFS_FILE_ATTR_OP_LAMINATE) || + (attr_op == UNIFYFS_FILE_ATTR_OP_TRUNCATE))) { + LOGDBG("setting attr.size to %" PRIu64, src->size); + dst->size = src->size; + } + + if ((src->atime.tv_sec != 0) && + (attr_op == UNIFYFS_FILE_ATTR_OP_CREATE)) { + LOGDBG("setting attr.atime to %d.%09ld", + (int)src->atime.tv_sec, src->atime.tv_nsec); + dst->atime = src->atime; + } + + if ((src->mtime.tv_sec != 0) && + ((attr_op == UNIFYFS_FILE_ATTR_OP_CREATE) || + (attr_op == UNIFYFS_FILE_ATTR_OP_DATA) || + (attr_op == UNIFYFS_FILE_ATTR_OP_LAMINATE) || + (attr_op == UNIFYFS_FILE_ATTR_OP_TRUNCATE))) { + LOGDBG("setting attr.mtime to %d.%09ld", + (int)src->mtime.tv_sec, src->mtime.tv_nsec); + dst->mtime = src->mtime; + } + + if ((src->ctime.tv_sec != 0) && + ((attr_op == UNIFYFS_FILE_ATTR_OP_CHGRP) || + (attr_op == UNIFYFS_FILE_ATTR_OP_CHMOD) || + (attr_op == UNIFYFS_FILE_ATTR_OP_CHOWN) || + (attr_op == UNIFYFS_FILE_ATTR_OP_CREATE) || + (attr_op == UNIFYFS_FILE_ATTR_OP_DATA) || + (attr_op == UNIFYFS_FILE_ATTR_OP_LAMINATE))) { + LOGDBG("setting attr.ctime to %d.%09ld", + (int)src->ctime.tv_sec, src->ctime.tv_nsec); + dst->ctime = src->ctime; + } + + if ((src->is_laminated != -1) && + ((attr_op == UNIFYFS_FILE_ATTR_OP_CREATE) || + (attr_op == UNIFYFS_FILE_ATTR_OP_LAMINATE))) { + LOGDBG("setting attr.is_laminated to %d", src->is_laminated); + dst->is_laminated = src->is_laminated; + } + + if (src->filename && !dst->filename) { + LOGDBG("setting attr.filename to %s", src->filename); + dst->filename = strdup(src->filename); + } + + return 0; +} + static inline void unifyfs_file_attr_to_stat(unifyfs_file_attr_t* fattr, struct stat* sb) { if (fattr && sb) { + debug_print_file_attr(fattr); + sb->st_dev = UNIFYFS_STAT_DEFAULT_DEV; sb->st_ino = fattr->gfid; sb->st_mode = fattr->mode; @@ -77,6 +237,9 @@ void unifyfs_file_attr_to_stat(unifyfs_file_attr_t* fattr, struct stat* sb) sb->st_gid = fattr->gid; sb->st_rdev = UNIFYFS_STAT_DEFAULT_DEV; sb->st_size = fattr->size; + + /* TODO: use cfg.logio_chunk_size here for st_blksize + * and report acutal chunks allocated for st_blocks */ sb->st_blksize = UNIFYFS_STAT_DEFAULT_BLKSIZE; sb->st_blocks = fattr->size / UNIFYFS_STAT_DEFAULT_BLKSIZE; if (fattr->size % UNIFYFS_STAT_DEFAULT_BLKSIZE > 0) { @@ -90,51 +253,86 @@ void unifyfs_file_attr_to_stat(unifyfs_file_attr_t* fattr, struct stat* sb) */ sb->st_nlink = fattr->is_laminated ? 1 : 0; - sb->st_atime = fattr->atime.tv_sec; - sb->st_mtime = fattr->mtime.tv_sec; - sb->st_ctime = fattr->ctime.tv_sec; + sb->st_atim = fattr->atime; + sb->st_mtim = fattr->mtime; + sb->st_ctim = fattr->ctime; } } -typedef struct { - off_t file_pos; - off_t mem_pos; - size_t length; - int fid; -} unifyfs_index_t; +/* given an input mode, mask it with umask and return. 
+ * set perms=0 to request all read/write bits */ +static inline +mode_t unifyfs_getmode(mode_t perms) +{ + /* perms == 0 is shorthand for all read and write bits */ + if (perms == 0) { + perms = 0666; + } -/* Header for read request reply in client shared memory region. - * The associated data payload immediately follows the header in - * the shmem region. - * offset - offset within file - * length - data size - * gfid - global file id - * errcode - read error code (zero on success) */ -typedef struct { - size_t offset; - size_t length; - int gfid; - int errcode; -} shm_meta_t; + /* get current user mask */ + mode_t mask = umask(0); + umask(mask); -/* State values for client shared memory region */ -typedef enum { - SHMEM_REGION_EMPTY = 0, // set by client to indicate drain complete - SHMEM_REGION_DATA_READY = 1, // set by server to initiate client drain - SHMEM_REGION_DATA_COMPLETE = 2 // set by server when done writing data -} shm_region_state_e; - -/* Header for client shared memory region. - * sync - for synchronizing updates/access by server threads - * meta_cnt - number of shm_meta_t (i.e., read replies) currently in shmem - * bytes - total bytes of shmem region in use (shm_meta_t + payloads) - * state - region state variable used for client-server coordination */ - typedef struct { - pthread_mutex_t sync; - volatile size_t meta_cnt; - volatile size_t bytes; - volatile shm_region_state_e state; -} shm_header_t; + /* mask off bits from desired permissions */ + mode_t ret = (perms & 0777) & ~mask; + return ret; +} + +/* qsort comparison function for name_rank_pair_t */ +static inline +int compare_name_rank_pair(const void* a, const void* b) +{ + const name_rank_pair_t* pair_a = (const name_rank_pair_t*) a; + const name_rank_pair_t* pair_b = (const name_rank_pair_t*) b; + + /* compare the hostnames */ + int cmp = strcmp(pair_a->hostname, pair_b->hostname); + if (0 == cmp) { + /* if hostnames are the same, compare the rank */ + cmp = pair_a->rank - pair_b->rank; + } + return cmp; +} + +/* qsort comparison function for int */ +static inline +int compare_int(const void* a, const void* b) +{ + int aval = *(const int*)a; + int bval = *(const int*)b; + return aval - bval; +} + + +/* + * Hash a file path to a uint64_t using MD5 + * @param path absolute file path + * @return hash value + */ +uint64_t compute_path_md5(const char* path); + +/* + * Hash a file path to an integer gfid + * @param path absolute file path + * @return gfid + */ +static inline +int unifyfs_generate_gfid(const char* path) +{ + /* until we support 64-bit gfids, use top 32 bits */ + uint64_t hash64 = compute_path_md5(path); + uint32_t hash32 = (uint32_t)(hash64 >> 32); + + /* TODO: Remove next statement once we get rid of MDHIM. + * + * MDHIM requires positive values for integer keys, due to the way + * slice servers are calculated. We use an integer key for the + * gfid -> file attributes index. To guarantee a positive value, we + * shift right one bit to make sure the top bit is zero. */ + hash32 = hash32 >> 1; + + return (int)hash32; +} #ifdef __cplusplus } // extern "C" diff --git a/common/src/unifyfs_misc.c b/common/src/unifyfs_misc.c new file mode 100644 index 000000000..345daa172 --- /dev/null +++ b/common/src/unifyfs_misc.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. 
+ * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +/* + * This file contains miscellaneous common functions that don't fit into a + * particular common/src/ file. + */ +#include +#include +#include + +/* + * Re-implementation of BSD's strlcpy() function. + * + * This is a basically a safer version of strlncpy() since it always + * NULL-terminates the buffer. Google 'strlcpy' for full documentation. + */ +size_t strlcpy(char* dest, const char* src, size_t size) +{ + size_t src_len; + + src_len = strnlen(src, size); + if (src_len == size) { + /* Our string is too long, have to truncate */ + src_len = size - 1; + } + + memcpy(dest, src, src_len); + dest[src_len] = '\0'; + + return strlen(dest); +} + +/* + * This is a re-implementation of the Linux kernel's scnprintf() function. + * + * It's snprintf() but returns the number of chars actually written into buf[] + * not including the '\0'. It also avoids the -Wformat-truncation warnings. + */ +int scnprintf(char* buf, size_t size, const char* fmt, ...) +{ + va_list args; + int rc; + + va_start(args, fmt); + rc = vsnprintf(buf, size, fmt, args); + va_end(args); + + if (rc >= size) { + /* We truncated */ + return size - 1; + } + + return rc; +} diff --git a/common/src/unifyfs_misc.h b/common/src/unifyfs_misc.h new file mode 100644 index 000000000..3678c2ec6 --- /dev/null +++ b/common/src/unifyfs_misc.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#ifndef __UNIFYFS_MISC__ +#define __UNIFYFS_MISC__ + +size_t strlcpy(char* dest, const char* src, size_t size); + +int scnprintf(char* buf, size_t size, const char* fmt, ...); + +#endif diff --git a/common/src/err_enumerator.c b/common/src/unifyfs_rc.c similarity index 53% rename from common/src/err_enumerator.c rename to common/src/unifyfs_rc.c index 8f978f175..5bb7f61ca 100644 --- a/common/src/err_enumerator.c +++ b/common/src/unifyfs_rc.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. 
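To make the truncation semantics of these helpers concrete, here is a small stand-alone example (an illustration, not part of the patch). It assumes the program links against the common library that provides `strlcpy()` and `scnprintf()` and that libc does not already supply a conflicting `strlcpy()`.

.. code-block:: C

    #include <stdio.h>

    #include "unifyfs_misc.h"

    int main(void)
    {
        char buf[8];

        /* strlcpy() truncates to fit, always NUL-terminates, and returns
         * the number of bytes stored (excluding the terminator) */
        size_t n = strlcpy(buf, "hello, world", sizeof(buf));
        printf("strlcpy stored %zu bytes: '%s'\n", n, buf);  /* 7, "hello, " */

        /* scnprintf() returns the bytes actually written, at most size-1,
         * instead of the would-have-been length that snprintf() reports */
        int w = scnprintf(buf, sizeof(buf), "gfid=%d", 123456789);
        printf("scnprintf stored %d bytes: '%s'\n", w, buf); /* 7, "gfid=12" */

        return 0;
    }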
@@ -34,21 +34,24 @@ * MIT License - See LICENSE.tedium */ -#include "err_enumerator.h" -#include +#include #include +#include #include +#include "unifyfs_rc.h" + + /* c-strings for enum names */ -#define ENUMITEM(name, desc) \ - const char *UNIFYFS_ERROR_ ## name ## _NAME_STR = #name; +#define ENUMITEM(name, desc) \ + const char* UNIFYFS_ERROR_ ## name ## _NAME_STR = #name; UNIFYFS_ERROR_ENUMERATOR #undef ENUMITEM -const char *unifyfs_error_enum_str(unifyfs_error_e e) +const char* unifyfs_rc_enum_str(unifyfs_rc rc) { - switch (e) { + switch (rc) { case UNIFYFS_FAILURE: return "UNIFYFS_FAILURE"; case UNIFYFS_SUCCESS: @@ -56,7 +59,7 @@ const char *unifyfs_error_enum_str(unifyfs_error_e e) #define ENUMITEM(name, desc) \ case UNIFYFS_ERROR_ ## name: \ return UNIFYFS_ERROR_ ## name ## _NAME_STR; - UNIFYFS_ERROR_ENUMERATOR +UNIFYFS_ERROR_ENUMERATOR #undef ENUMITEM default : break; @@ -66,14 +69,16 @@ const char *unifyfs_error_enum_str(unifyfs_error_e e) /* c-strings for enum descriptions */ -#define ENUMITEM(name, desc) \ - const char *UNIFYFS_ERROR_ ## name ## _DESC_STR = #desc; +#define ENUMITEM(name, desc) \ + const char* UNIFYFS_ERROR_ ## name ## _DESC_STR = #desc; UNIFYFS_ERROR_ENUMERATOR #undef ENUMITEM -const char *unifyfs_error_enum_description(unifyfs_error_e e) +char posix_errstr[1024]; + +const char* unifyfs_rc_enum_description(unifyfs_rc rc) { - switch (e) { + switch (rc) { case UNIFYFS_FAILURE: return "Failure"; case UNIFYFS_SUCCESS: @@ -81,33 +86,55 @@ const char *unifyfs_error_enum_description(unifyfs_error_e e) #define ENUMITEM(name, desc) \ case UNIFYFS_ERROR_ ## name: \ return UNIFYFS_ERROR_ ## name ## _DESC_STR; - UNIFYFS_ERROR_ENUMERATOR +UNIFYFS_ERROR_ENUMERATOR #undef ENUMITEM - default : - break; + default: + /* assume it's a POSIX errno value */ + snprintf(posix_errstr, sizeof(posix_errstr), "%s", + strerror((int)rc)); + return (const char*)posix_errstr; } return NULL; } -unifyfs_error_e unifyfs_error_enum_from_str(const char *s) +unifyfs_rc unifyfs_rc_enum_from_str(const char* s) { - if (0) - ; + if (strcmp(s, "Success") == 0) { + return UNIFYFS_SUCCESS; + } else if (strcmp(s, "Failure") == 0) { + return UNIFYFS_FAILURE; + } #define ENUMITEM(name, desc) \ - else if (strcmp(s, #name) == 0) \ - return UNIFYFS_ERROR_ ## name; - UNIFYFS_ERROR_ENUMERATOR; + else if (strcmp(s, #name) == 0) { \ + return UNIFYFS_ERROR_ ## name; \ + } +UNIFYFS_ERROR_ENUMERATOR #undef ENUMITEM - return UNIFYFS_INVALID_ERROR; + return UNIFYFS_INVALID_RC; } /* validity check */ - -int check_valid_unifyfs_error_enum(unifyfs_error_e e) +int check_valid_unifyfs_rc_enum(unifyfs_rc rc) { - return ((e > UNIFYFS_INVALID_ERROR) && - (e < UNIFYFS_ERROR_MAX) && - (unifyfs_error_enum_str(e) != NULL)); + return ((rc > UNIFYFS_INVALID_RC) && + (rc < UNIFYFS_END_ERRORS) && + (unifyfs_rc_enum_str(rc) != NULL)); } +/* convert to an errno value */ +int unifyfs_rc_errno(unifyfs_rc rc) +{ + if (rc == UNIFYFS_SUCCESS) { + return 0; + } else if (rc == UNIFYFS_INVALID_RC) { + return EINVAL; + } else if ((rc == UNIFYFS_FAILURE) || + ((rc > UNIFYFS_BEGIN_ERRORS) && (rc < UNIFYFS_END_ERRORS))) { + /* none of our custom errors have good errno counterparts, use EIO */ + return EIO; + } else { + /* should be a normal errno value already */ + return (int)rc; + } +} diff --git a/common/src/unifyfs_rc.h b/common/src/unifyfs_rc.h new file mode 100644 index 000000000..cf54ad904 --- /dev/null +++ b/common/src/unifyfs_rc.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. 
+ * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +/* Copyright (c) 2018 - Michael J. Brim + * + * Enumerator is part of https://github.com/MichaelBrim/tedium + * + * MIT License - See LICENSE.tedium + */ + +#ifndef _UNIFYFS_RC_ENUMERATOR_H_ +#define _UNIFYFS_RC_ENUMERATOR_H_ + +#include + +/* #define __ELASTERROR if our errno.h doesn't define it for us */ +#ifndef __ELASTERROR +#define __ELASTERROR 1000 +#endif + +/* NOTE: If POSIX errno.h defines an error code that we can use sensibly, + * don't create a duplicate one for UnifyFS */ + +/** + * @brief enumerator list expanded many times with varied ENUMITEM() definitions + * + * @param item name + * @param item short description + */ +#define UNIFYFS_ERROR_ENUMERATOR \ + ENUMITEM(BADCONFIG, "Configuration has invalid setting") \ + ENUMITEM(GOTCHA, "Gotcha operation error") \ + ENUMITEM(KEYVAL, "Key-value store operation error") \ + ENUMITEM(MARGO, "Mercury/Argobots operation error") \ + ENUMITEM(MDHIM, "MDHIM operation error") \ + ENUMITEM(META, "Metadata store operation error") \ + ENUMITEM(NYI, "Not yet implemented") \ + ENUMITEM(PMI, "PMI2/PMIx error") \ + ENUMITEM(SHMEM, "Shared memory region init/access error") \ + ENUMITEM(THRDINIT, "Thread initialization failed") \ + ENUMITEM(TIMEOUT, "Timed out") \ + + +#ifdef __cplusplus +extern "C" { +#endif + + +/** + * @brief enum for UnifyFS return codes + */ +typedef enum { + UNIFYFS_INVALID_RC = -2, + UNIFYFS_FAILURE = -1, + UNIFYFS_SUCCESS = 0, + /* Start our error numbers after the standard errno.h ones */ + UNIFYFS_BEGIN_ERRORS = __ELASTERROR, +#define ENUMITEM(name, desc) \ + UNIFYFS_ERROR_ ## name, +UNIFYFS_ERROR_ENUMERATOR +#undef ENUMITEM + UNIFYFS_END_ERRORS +} unifyfs_rc; + +/** + * @brief get C-string for given error enum value + */ +const char* unifyfs_rc_enum_str(unifyfs_rc rc); + +/** + * @brief get description for given error enum value + */ +const char* unifyfs_rc_enum_description(unifyfs_rc rc); + +/** + * @brief check validity of given error enum value + */ +int check_valid_unifyfs_rc_enum(unifyfs_rc rc); + +/** + * @brief get enum value for given error C-string + */ +unifyfs_rc unifyfs_rc_enum_from_str(const char* s); + +/** + * @brief convert a UnifyFS error to an errno value + */ +int unifyfs_rc_errno(unifyfs_rc rc); + +#ifdef __cplusplus +} /* extern C */ +#endif + +#endif /* UNIFYFS_RC_ENUMERATOR_H */ diff --git a/common/src/unifyfs_rpc_types.h b/common/src/unifyfs_rpc_types.h new file mode 100644 index 000000000..126ffdd3e --- /dev/null +++ b/common/src/unifyfs_rpc_types.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
+ */ + +#ifndef __UNIFYFS_RPC_TYPES_H +#define __UNIFYFS_RPC_TYPES_H + +#include +#include +#include + +#include "unifyfs_meta.h" + +/* rpc encode/decode for timespec structs */ +typedef struct timespec sys_timespec_t; +MERCURY_GEN_STRUCT_PROC(sys_timespec_t, + ((uint64_t)(tv_sec)) + ((uint64_t)(tv_nsec)) +) + +/* rpc encode/decode for unifyfs_file_attr_t */ +MERCURY_GEN_STRUCT_PROC(unifyfs_file_attr_t, + ((int32_t)(gfid)) + ((int32_t)(is_laminated)) + ((uint32_t)(mode)) + ((uint32_t)(uid)) + ((uint32_t)(gid)) + ((hg_size_t)(size)) + ((sys_timespec_t)(atime)) + ((sys_timespec_t)(ctime)) + ((sys_timespec_t)(mtime)) + ((hg_const_string_t)(filename)) +) + +#endif /* __UNIFYFS_RPC_TYPES_H */ diff --git a/common/src/unifyfs_rpc_util.c b/common/src/unifyfs_rpc_util.c index 7a60725a9..5cc99bb4f 100644 --- a/common/src/unifyfs_rpc_util.c +++ b/common/src/unifyfs_rpc_util.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. diff --git a/common/src/unifyfs_rpc_util.h b/common/src/unifyfs_rpc_util.h index 8067ed5c6..e18a51025 100644 --- a/common/src/unifyfs_rpc_util.h +++ b/common/src/unifyfs_rpc_util.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2018, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2018, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. diff --git a/common/src/unifyfs_runstate.c b/common/src/unifyfs_runstate.c deleted file mode 100644 index 660b7e693..000000000 --- a/common/src/unifyfs_runstate.c +++ /dev/null @@ -1,97 +0,0 @@ -#include -#include -#include -#include -#include - -#include "unifyfs_keyval.h" -#include "unifyfs_log.h" -#include "unifyfs_runstate.h" - -const char* runstate_file = "unifyfs-runstate.conf"; - -int unifyfs_read_runstate(unifyfs_cfg_t* cfg, - const char* runstate_path) -{ - int rc = (int)UNIFYFS_SUCCESS; - int uid = (int)getuid(); - char runstate_fname[UNIFYFS_MAX_FILENAME] = {0}; - - if (cfg == NULL) { - LOGERR("NULL config"); - return (int)UNIFYFS_ERROR_INVAL; - } - - if (runstate_path == NULL) { - if (cfg->runstate_dir == NULL) { - LOGERR("bad runstate dir config setting"); - return (int)UNIFYFS_ERROR_APPCONFIG; - } - snprintf(runstate_fname, sizeof(runstate_fname), - "%s/%s.%d", cfg->runstate_dir, runstate_file, uid); - } else { - snprintf(runstate_fname, sizeof(runstate_fname), - "%s", runstate_path); - } - - if (unifyfs_config_process_ini_file(cfg, runstate_fname) != 0) { - LOGERR("failed to process runstate file %s", runstate_fname); - rc = (int)UNIFYFS_ERROR_APPCONFIG; - } - - return rc; -} - -int unifyfs_write_runstate(unifyfs_cfg_t* cfg) -{ - int rc = (int)UNIFYFS_SUCCESS; - int uid = (int)getuid(); - FILE* runstate_fp = NULL; - char runstate_fname[UNIFYFS_MAX_FILENAME] = {0}; - - if (cfg == NULL) { - LOGERR("NULL config"); - return (int)UNIFYFS_ERROR_INVAL; - } - - snprintf(runstate_fname, sizeof(runstate_fname), - "%s/%s.%d", cfg->runstate_dir, runstate_file, uid); - - runstate_fp = fopen(runstate_fname, "w"); - if (runstate_fp == NULL) { - LOGERR("failed to create file %s", runstate_fname); - rc = (int)UNIFYFS_ERROR_FILE; - } else { - if ((unifyfs_log_stream != NULL) && - (unifyfs_log_level >= 
LOG_INFO)) { - unifyfs_config_print(cfg, unifyfs_log_stream); - } - unifyfs_config_print_ini(cfg, runstate_fp); - fclose(runstate_fp); - } - - return rc; -} - -int unifyfs_clean_runstate(unifyfs_cfg_t* cfg) -{ - int rc = (int)UNIFYFS_SUCCESS; - int uid = (int)getuid(); - char runstate_fname[UNIFYFS_MAX_FILENAME] = {0}; - - if (cfg == NULL) { - LOGERR("invalid config arg"); - return (int)UNIFYFS_ERROR_INVAL; - } - - snprintf(runstate_fname, sizeof(runstate_fname), - "%s/%s.%d", cfg->runstate_dir, runstate_file, uid); - - rc = unlink(runstate_fname); - if (rc != 0) { - LOGERR("failed to remove file %s", runstate_fname); - rc = (int)UNIFYFS_ERROR_FILE; - } - - return rc; -} diff --git a/common/src/unifyfs_runstate.h b/common/src/unifyfs_runstate.h deleted file mode 100644 index e32c67372..000000000 --- a/common/src/unifyfs_runstate.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef _UNIFYFS_RUNSTATE_H_ -#define _UNIFYFS_RUNSTATE_H_ - -#include "unifyfs_configurator.h" - -#ifdef __cplusplus -extern "C" { -#endif - -int unifyfs_read_runstate(unifyfs_cfg_t* cfg, - const char* runstate_path); - -int unifyfs_write_runstate(unifyfs_cfg_t* cfg); - -int unifyfs_clean_runstate(unifyfs_cfg_t* cfg); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // UNIFYFS_RUNSTATE_H diff --git a/common/src/unifyfs_server_rpcs.h b/common/src/unifyfs_server_rpcs.h index fd86f674d..349e1f11b 100644 --- a/common/src/unifyfs_server_rpcs.h +++ b/common/src/unifyfs_server_rpcs.h @@ -1,3 +1,17 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
+ */ + #ifndef __UNIFYFS_SERVER_RPCS_H #define __UNIFYFS_SERVER_RPCS_H @@ -5,41 +19,29 @@ * Declarations for server-server margo RPCs */ +#include #include #include #include #include +#include "unifyfs_rpc_types.h" + #ifdef __cplusplus extern "C" { #endif -/* server_hello_rpc (server => server) - * - * say hello from one server to another */ -MERCURY_GEN_PROC(server_hello_in_t, - ((int32_t)(src_rank)) - ((hg_const_string_t)(message_str))) -MERCURY_GEN_PROC(server_hello_out_t, - ((int32_t)(ret))) -DECLARE_MARGO_RPC_HANDLER(server_hello_rpc) +/*---- Server Point-to-Point (p2p) RPCs ----*/ -/* server_request_rpc (server => server) - * - * request from one server to another */ -MERCURY_GEN_PROC(server_request_in_t, - ((int32_t)(src_rank)) - ((int32_t)(req_id)) - ((int32_t)(req_tag)) - ((hg_size_t)(bulk_size)) - ((hg_bulk_t)(bulk_handle))) -MERCURY_GEN_PROC(server_request_out_t, +/* Report server pid to rank 0 */ +MERCURY_GEN_PROC(server_pid_in_t, + ((int32_t)(rank)) + ((int32_t)(pid))) +MERCURY_GEN_PROC(server_pid_out_t, ((int32_t)(ret))) -DECLARE_MARGO_RPC_HANDLER(server_request_rpc) +DECLARE_MARGO_RPC_HANDLER(server_pid_rpc) -/* chunk_read_request_rpc (server => server) - * - * request for chunk reads from another server */ +/* Chunk read request */ MERCURY_GEN_PROC(chunk_read_request_in_t, ((int32_t)(src_rank)) ((int32_t)(app_id)) @@ -52,9 +54,7 @@ MERCURY_GEN_PROC(chunk_read_request_out_t, ((int32_t)(ret))) DECLARE_MARGO_RPC_HANDLER(chunk_read_request_rpc) -/* chunk_read_response_rpc (server => server) - * - * response to remote chunk reads request */ +/* Chunk read response */ MERCURY_GEN_PROC(chunk_read_response_in_t, ((int32_t)(src_rank)) ((int32_t)(app_id)) @@ -67,6 +67,119 @@ MERCURY_GEN_PROC(chunk_read_response_out_t, ((int32_t)(ret))) DECLARE_MARGO_RPC_HANDLER(chunk_read_response_rpc) +/* Add file extents at owner */ +MERCURY_GEN_PROC(add_extents_in_t, + ((int32_t)(src_rank)) + ((int32_t)(gfid)) + ((int32_t)(num_extents)) + ((hg_bulk_t)(extents))) +MERCURY_GEN_PROC(add_extents_out_t, + ((int32_t)(ret))) +DECLARE_MARGO_RPC_HANDLER(add_extents_rpc) + +/* Find file extent locations by querying owner */ +MERCURY_GEN_PROC(find_extents_in_t, + ((int32_t)(src_rank)) + ((int32_t)(gfid)) + ((int32_t)(num_extents)) + ((hg_bulk_t)(extents))) +MERCURY_GEN_PROC(find_extents_out_t, + ((int32_t)(num_locations)) + ((hg_bulk_t)(locations)) + ((int32_t)(ret))) +DECLARE_MARGO_RPC_HANDLER(find_extents_rpc) + +/* Get file size from owner */ +MERCURY_GEN_PROC(filesize_in_t, + ((int32_t)(gfid))) +MERCURY_GEN_PROC(filesize_out_t, + ((hg_size_t)(filesize)) + ((int32_t)(ret))) +DECLARE_MARGO_RPC_HANDLER(filesize_rpc) + +/* Laminate file at owner */ +MERCURY_GEN_PROC(laminate_in_t, + ((int32_t)(gfid))) +MERCURY_GEN_PROC(laminate_out_t, + ((int32_t)(ret))) +DECLARE_MARGO_RPC_HANDLER(laminate_rpc) + +/* Get file metadata from owner */ +MERCURY_GEN_PROC(metaget_in_t, + ((int32_t)(gfid))) +MERCURY_GEN_PROC(metaget_out_t, + ((unifyfs_file_attr_t)(attr)) + ((int32_t)(ret))) +DECLARE_MARGO_RPC_HANDLER(metaget_rpc) + +/* Set file metadata at owner */ +MERCURY_GEN_PROC(metaset_in_t, + ((int32_t)(gfid)) + ((int32_t)(fileop)) + ((unifyfs_file_attr_t)(attr))) +MERCURY_GEN_PROC(metaset_out_t, + ((int32_t)(ret))) +DECLARE_MARGO_RPC_HANDLER(metaset_rpc) + +/* Truncate file at owner */ +MERCURY_GEN_PROC(truncate_in_t, + ((int32_t)(gfid)) + ((hg_size_t)(filesize))) +MERCURY_GEN_PROC(truncate_out_t, + ((int32_t)(ret))) +DECLARE_MARGO_RPC_HANDLER(truncate_rpc) + +/*---- Collective RPCs ----*/ + +/* Broadcast file extents to all 
servers */ +MERCURY_GEN_PROC(extent_bcast_in_t, + ((int32_t)(root)) + ((int32_t)(gfid)) + ((int32_t)(num_extents)) + ((hg_bulk_t)(extents))) +MERCURY_GEN_PROC(extent_bcast_out_t, + ((int32_t)(ret))) +DECLARE_MARGO_RPC_HANDLER(extent_bcast_rpc) + +/* Broadcast file metadata to all servers */ +MERCURY_GEN_PROC(fileattr_bcast_in_t, + ((int32_t)(root)) + ((int32_t)(gfid)) + ((int32_t)(attrop)) + ((unifyfs_file_attr_t)(attr))) +MERCURY_GEN_PROC(fileattr_bcast_out_t, + ((int32_t)(ret))) +DECLARE_MARGO_RPC_HANDLER(fileattr_bcast_rpc) + +/* Broadcast laminated file metadata to all servers */ +MERCURY_GEN_PROC(laminate_bcast_in_t, + ((int32_t)(root)) + ((int32_t)(gfid)) + ((int32_t)(num_extents)) + ((unifyfs_file_attr_t)(attr)) + ((hg_bulk_t)(extents))) +MERCURY_GEN_PROC(laminate_bcast_out_t, + ((int32_t)(ret))) +DECLARE_MARGO_RPC_HANDLER(laminate_bcast_rpc) + +/* Broadcast truncation point to all servers */ +MERCURY_GEN_PROC(truncate_bcast_in_t, + ((int32_t)(root)) + ((int32_t)(gfid)) + ((hg_size_t)(filesize))) +MERCURY_GEN_PROC(truncate_bcast_out_t, + ((int32_t)(ret))) +DECLARE_MARGO_RPC_HANDLER(truncate_bcast_rpc) + +/* Broadcast unlink to all servers */ +MERCURY_GEN_PROC(unlink_bcast_in_t, + ((int32_t)(root)) + ((int32_t)(gfid))) +MERCURY_GEN_PROC(unlink_bcast_out_t, + ((int32_t)(ret))) +DECLARE_MARGO_RPC_HANDLER(unlink_bcast_rpc) + + #ifdef __cplusplus } // extern "C" #endif diff --git a/common/src/unifyfs_shm.c b/common/src/unifyfs_shm.c index 75d63a4c2..61fed37c5 100644 --- a/common/src/unifyfs_shm.c +++ b/common/src/unifyfs_shm.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -11,25 +11,27 @@ * For details, see https://github.com/LLNL/UnifyFS. * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. */ + #include -#include -#include #include +#include +#include #include +#include #include #include #include -#include -#include "unifyfs_log.h" #include "unifyfs_const.h" +#include "unifyfs_log.h" +#include "unifyfs_shm.h" -/* TODO: same function exists in client code, move this to common */ -/* creates a shared memory of given size under specified name, - * returns address of new shared memory if successful, - * returns NULL on error */ -void* unifyfs_shm_alloc(const char* name, size_t size) +/* Allocate a shared memory region with given name and size, + * and map it into memory. 
+ * Returns a pointer to shm_context for region if successful, + * or NULL on error */ +shm_context* unifyfs_shm_alloc(const char* name, size_t size) { int ret; @@ -38,29 +40,34 @@ void* unifyfs_shm_alloc(const char* name, size_t size) int fd = shm_open(name, O_RDWR | O_CREAT, 0770); if (fd == -1) { /* failed to open shared memory */ - LOGERR("Failed to open shared memory %s errno=%d (%s)", - name, errno, strerror(errno)); + LOGERR("Failed to open shared memory %s (%s)", + name, strerror(errno)); return NULL; } /* set size of shared memory region */ #ifdef HAVE_POSIX_FALLOCATE - ret = posix_fallocate(fd, 0, size); - if (ret != 0) { - /* failed to set size shared memory */ - errno = ret; - LOGERR("posix_fallocate failed for %s errno=%d (%s)", - name, errno, strerror(errno)); - close(fd); - return NULL; - } + do { /* this loop handles syscall interruption for large allocations */ + int try_count = 0; + ret = posix_fallocate(fd, 0, size); + if (ret != 0) { + /* failed to set size shared memory */ + try_count++; + if ((ret != EINTR) || (try_count >= 5)) { + LOGERR("posix_fallocate failed for %s (%s)", + name, strerror(ret)); + close(fd); + return NULL; + } + } + } while (ret != 0); #else errno = 0; ret = ftruncate(fd, size); if (ret == -1) { /* failed to set size of shared memory */ - LOGERR("ftruncate failed for %s errno=%d (%s)", - name, errno, strerror(errno)); + LOGERR("ftruncate failed for %s (%s)", + name, strerror(errno)); close(fd); return NULL; } @@ -72,8 +79,8 @@ void* unifyfs_shm_alloc(const char* name, size_t size) fd, 0); if (addr == MAP_FAILED) { /* failed to open shared memory */ - LOGERR("Failed to mmap shared memory %s errno=%d (%s)", - name, errno, strerror(errno)); + LOGERR("Failed to mmap shared memory %s (%s)", + name, strerror(errno)); close(fd); return NULL; } @@ -82,60 +89,87 @@ void* unifyfs_shm_alloc(const char* name, size_t size) errno = 0; ret = close(fd); if (ret == -1) { - /* failed to open shared memory */ - LOGERR("Failed to mmap shared memory %s errno=%d (%s)", - name, errno, strerror(errno)); + /* failed to close shared memory */ + LOGERR("Failed to close shared memory fd %d (%s)", + fd, strerror(errno)); /* not fatal, so keep going */ } - /* return address */ - return addr; + /* return pointer to new shm_context */ + shm_context* ctx = (shm_context*) calloc(1, sizeof(shm_context)); + if (NULL != ctx) { + snprintf(ctx->name, sizeof(ctx->name), "%s", name); + ctx->addr = addr; + ctx->size = size; + } + return ctx; } -/* unmaps shared memory region from memory, and releases it, - * caller should provide the address of a pointer to the region - * in paddr, sets paddr to NULL on return, - * returns UNIFYFS_SUCCESS on success */ -int unifyfs_shm_free(const char* name, size_t size, void** paddr) +/* Unmaps shared memory region and frees its context. + * The shm_context pointer is set to NULL on success. 
+ * Returns UNIFYFS_SUCCESS on success, or error code */ +int unifyfs_shm_free(shm_context** pctx) { /* check that we got an address (to something) */ - if (paddr == NULL) { - return UNIFYFS_FAILURE; + if (pctx == NULL) { + return EINVAL; + } + + shm_context* ctx = *pctx; + if (ctx == NULL) { + return EINVAL; } /* get address of shared memory region */ - void* addr = *paddr; + void* addr = ctx->addr; - /* if we have a pointer, try to munmap and unlink it */ + /* if we have a pointer, try to munmap it */ if (addr != NULL) { /* unmap shared memory from memory space */ errno = 0; - int rc = munmap(addr, size); + int rc = munmap(addr, ctx->size); if (rc == -1) { /* failed to unmap shared memory */ - LOGERR("Failed to unmap shared memory %s errno=%d (%s)", - name, errno, strerror(errno)); - - /* not fatal, so keep going */ - } - - /* release our reference to the shared memory region */ - errno = 0; - rc = shm_unlink(name); - if (rc == -1) { int err = errno; - if (ENOENT != err) { - /* failed to remove shared memory */ - LOGERR("Failed to unlink shared memory %s errno=%d (%s)", - name, err, strerror(err)); - } + LOGERR("Failed to unmap shared memory %s (%s)", + ctx->name, strerror(err)); + /* not fatal, so keep going */ } } + /* free shmem context structure */ + free(ctx); + /* set caller's pointer to NULL */ - *paddr = NULL; + *pctx = NULL; + + return UNIFYFS_SUCCESS; +} + +/* Unlinks file used to attach to a shared memory region. + * Once unlinked, no other processes may attach. + * Returns UNIFYFS_SUCCESS on success, or error code. */ +int unifyfs_shm_unlink(shm_context* ctx) +{ + /* check context pointer */ + if (ctx == NULL) { + return EINVAL; + } + + /* unlink file naming the shared memory region */ + errno = 0; + int rc = shm_unlink(ctx->name); + if (rc == -1) { + int err = errno; + if (ENOENT != err) { + /* failed to remove shared memory */ + LOGERR("Failed to unlink shared memory %s (%s)", + ctx->name, strerror(err)); + } + /* not fatal, so keep going */ + } return UNIFYFS_SUCCESS; } diff --git a/common/src/unifyfs_shm.h b/common/src/unifyfs_shm.h index 33afe6a19..4c593f210 100644 --- a/common/src/unifyfs_shm.h +++ b/common/src/unifyfs_shm.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2018, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2018, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -15,20 +15,89 @@ #ifndef UNIFYFS_SHM_H #define UNIFYFS_SHM_H +#include +#include + +#include "unifyfs_rc.h" + +/* Set default region name len if not already defined */ +#ifndef SHMEM_NAME_LEN +#define SHMEM_NAME_LEN 64 +#endif + +/* printf() format strings used by both client and server to name shared + * memory regions. First %d is application id, second is client id. */ +#define SHMEM_DATA_FMTSTR "%d-data-%d" +#define SHMEM_SUPER_FMTSTR "%d-super-%d" + #ifdef __cplusplus extern "C" { #endif -/* allocate and attach a named shared memory region of a particular size - * and mmap into our memory, returns starting memory address on success, - * returns NULL on failure */ -void* unifyfs_shm_alloc(const char* name, size_t size); +/* Header for read-request reply in client shared memory region. + * The associated data payload immediately follows the header in + * the shmem region. 
+ * offset - offset within file + * length - data size + * gfid - global file id + * errcode - read error code (zero on success) */ +typedef struct shm_data_meta { + size_t offset; + size_t length; + int gfid; + int errcode; +} shm_data_meta; + +/* State values for client shared memory region */ +typedef enum { + SHMEM_REGION_EMPTY = 0, // set by client to indicate drain complete + SHMEM_REGION_DATA_READY = 1, // set by server to initiate client drain + SHMEM_REGION_DATA_COMPLETE = 2 // set by server when done writing data +} shm_data_state_e; + +/* Header for client shared memory region. + * sync - for synchronizing updates/access by server threads + * meta_cnt - number of shm_data_meta (i.e., read replies) currently in shmem + * bytes - total bytes of shmem region in use (shm_data_meta + payloads) + * state - region state variable used for client-server coordination */ + typedef struct shm_data_header { + pthread_mutex_t sync; + volatile size_t meta_cnt; + volatile size_t bytes; + volatile shm_data_state_e state; +} shm_data_header; -/* unmaps shared memory region from memory, and releases it, - * caller should povider the address of a pointer to the region - * in paddr, sets paddr to NULL on return, - * returns UNIFYFS_SUCCESS on success */ -int unifyfs_shm_free(const char* name, size_t size, void** paddr); +/* Context structure for maintaining state on an active + * shared memory region */ +typedef struct shm_context { + char name[SHMEM_NAME_LEN]; + void* addr; /* base address of shmem region mapping */ + size_t size; /* size of shmem region */ +} shm_context; + +/** + * Allocate a shared memory region with given name and size, + * and map it into memory. + * @param name region name + * @param size region size in bytes + * @return shmem context pointer (NULL on failure) + */ +shm_context* unifyfs_shm_alloc(const char* name, size_t size); + +/** + * Unmaps shared memory region and frees its context. Context pointer + * is set to NULL on success. + * @param pctx address of pointer to the shmem context + * @return UNIFYFS_SUCCESS or error code + */ +int unifyfs_shm_free(shm_context** pctx); + +/** + * Unlinks the file used to attach to shared memory region. + * @param ctx shmem context pointer + * @return UNIFYFS_SUCCESS or error code + */ +int unifyfs_shm_unlink(shm_context* ctx); #ifdef __cplusplus } // extern "C" diff --git a/configure.ac b/configure.ac index 8d7b802c3..c43897743 100755 --- a/configure.ac +++ b/configure.ac @@ -1,12 +1,15 @@ -dnl +dnl dnl This file is a part of UnifyFS. Please see LICENSE for the license dnl information. dnl Process this file with autoconf to produce a configure script. 
+AC_LANG([C]) + AC_INIT([unifyfs], m4_esyscmd([git describe --always | awk '/.*/{sub(/^v/,""); printf "%s",$1; exit}']), [unifycr@llnl.gov]) +AC_PREREQ(2.60) AC_CONFIG_SRCDIR([configure.ac]) AC_CONFIG_HEADERS([config.h]) AC_CONFIG_MACRO_DIR([m4]) @@ -16,32 +19,27 @@ AM_SILENT_RULES([yes]) AM_MAINTAINER_MODE([disable]) -LT_INIT - AC_PROG_CC_STDC AC_PROG_AWK AC_PROG_CPP AC_PROG_INSTALL AC_PROG_LN_S AC_PROG_MAKE_SET -AC_PROG_RANLIB - # fortran support -AC_ARG_ENABLE([fortran], - AC_HELP_STRING([--enable-fortran], - [Enable fortran compatibility and features])) +AC_ARG_ENABLE([fortran],[AS_HELP_STRING([--enable-fortran],[Enable fortran compatibility and features])]) AC_MSG_CHECKING(if fortran is wanted ) -if test "x$enable_fortran" = "xyes"; then -AC_MSG_RESULT(yes) -AC_PROG_FC -AC_FC_LIBRARY_LDFLAGS -AC_FC_DUMMY_MAIN -AM_CONDITIONAL([HAVE_FORTRAN], [true]) -else -AC_MSG_RESULT(no) -AM_CONDITIONAL([HAVE_FORTRAN], [false]) -fi +AS_IF([test "x$enable_fortran" = "xyes"],[ + AC_MSG_RESULT(yes) + AC_PROG_FC + AM_CONDITIONAL([HAVE_FORTRAN], [true]) +],[ + AC_MSG_RESULT(no) + AM_CONDITIONAL([HAVE_FORTRAN], [false]) +]) + +dnl Need to do Fortran checks before initializing LIBTOOL +LT_INIT # Checks for typedefs, structures, and compiler characteristics. AC_TYPE_MODE_T @@ -62,12 +60,11 @@ AC_CHECK_MEMBERS([struct stat.st_rdev]) AC_CHECK_TYPES([ptrdiff_t]) # Checks for header files. -AC_CHECK_HEADERS([fcntl.h limits.h stdlib.h string.h sys/socket.h sys/time.h]) -AC_CHECK_HEADERS([unistd.h arpa/inet.h inttypes.h netdb.h netinet/in.h]) -AC_CHECK_HEADERS([stddef.h stdint.h libgen.h strings.h syslog.h]) -AC_CHECK_HEADERS([inttypes.h wchar.h wctype.h]) -AC_CHECK_HEADER([openssl/md5.h], [], [AC_MSG_FAILURE([ - *** openssl/md5.h missing, openssl-devel package required])]) +AC_CHECK_HEADERS([stddef.h stdint.h stdlib.h string.h unistd.h]) +AC_CHECK_HEADERS([fcntl.h inttypes.h libgen.h limits.h mntent.h strings.h syslog.h]) +AC_CHECK_HEADERS([wchar.h wctype.h]) +AC_CHECK_HEADERS([sys/mount.h sys/socket.h sys/statfs.h sys/time.h]) +AC_CHECK_HEADERS([arpa/inet.h netdb.h netinet/in.h]) # Checks for library functions. 
AC_FUNC_MALLOC @@ -79,69 +76,103 @@ AC_CHECK_FUNCS([gethostbyname strcasecmp strdup strerror strncasecmp strrchr]) AC_CHECK_FUNCS([gethostname strstr strtoumax strtol uname posix_fallocate]) # PMPI Init/Fini mount/unmount option -AC_ARG_ENABLE([mpi-mount], - AC_HELP_STRING([--enable-mpi-mount], - [Enable transparent mount/unmount at MPI_Init/Finalize.])) -if test "x$enable_mpi_mount" = "xyes"; then -AM_CONDITIONAL([USE_PMPI_WRAPPERS], [true]) -else -AM_CONDITIONAL([USE_PMPI_WRAPPERS], [false]) -fi +AC_ARG_ENABLE([mpi-mount],[AS_HELP_STRING([--enable-mpi-mount],[Enable transparent mount/unmount at MPI_Init/Finalize.])]) +AS_IF([test "x$enable_mpi_mount" = "xyes"],[ + AM_CONDITIONAL([USE_PMPI_WRAPPERS], [true]) +],[ + AM_CONDITIONAL([USE_PMPI_WRAPPERS], [false]) +]) # PMIx support build option -AC_ARG_ENABLE([pmix], - AC_HELP_STRING([--enable-pmix], - [Enable PMIx build options.])) -if test "x$enable_pmix" = "xyes"; then -AC_CHECK_HEADERS([pmix.h], - [AM_CONDITIONAL([USE_PMIX], [true])], - [AM_CONDITIONAL([USE_PMIX], [false])]) -else -AM_CONDITIONAL([USE_PMIX], [false]) -fi +AC_ARG_ENABLE([pmix],[AS_HELP_STRING([--enable-pmix],[Enable PMIx build options.])]) +AS_IF([test "x$enable_pmix" = "xyes"],[ + AC_CHECK_HEADERS([pmix.h], + [AM_CONDITIONAL([USE_PMIX], [true])], + [AM_CONDITIONAL([USE_PMIX], [false])]) +],[ + AM_CONDITIONAL([USE_PMIX], [false]) +]) # PMI2 support build option -AC_ARG_ENABLE([pmi], - AC_HELP_STRING([--enable-pmi], - [Enable PMI2 build options.])) -if test "x$enable_pmi" = "xyes"; then -AC_CHECK_HEADERS([pmi2.h], - [AM_CONDITIONAL([USE_PMI2], [true])], - [AM_CONDITIONAL([USE_PMI2], [false])]) -else -AM_CONDITIONAL([USE_PMI2], [false]) -fi +AC_ARG_ENABLE([pmi],[AS_HELP_STRING([--enable-pmi],[Enable PMI2 build options.])]) +AS_IF([test "x$enable_pmi" = "xyes"],[ + AC_CHECK_HEADERS([pmi2.h], + [AM_CONDITIONAL([USE_PMI2], [true])], + [AM_CONDITIONAL([USE_PMI2], [false])]) +],[ + AM_CONDITIONAL([USE_PMI2], [false]) +]) -CHECK_NUMA +# MDHIM support build option +AC_ARG_ENABLE([mdhim],[AS_HELP_STRING([--enable-mdhim],[Enable MDHIM build options.])]) +AS_IF([test "x$enable_mdhim" = "xyes"],[ + AM_CONDITIONAL([USE_MDHIM], [true]) + UNIFYFS_AC_LEVELDB +],[ + AM_CONDITIONAL([USE_MDHIM], [false]) +]) AC_ARG_WITH(pkgconfigdir, - [ --with-pkgconfigdir=DIR pkgconfig file in DIR @<:@LIBDIR/pkgconfig@:>@], + [AS_HELP_STRING([--with-pkgconfigdir=DIR],[pkgconfig file in DIR @<:@LIBDIR/pkgconfig@:>@])], [pkgconfigdir=$withval], [pkgconfigdir='${libdir}/pkgconfig']) AC_SUBST(pkgconfigdir) ## unifyfs options -AC_ARG_ENABLE(cuserid, -[ --disable-cuserid Disables attempted use of cuserid() at run time], -[if test "x$enableval" = "xno" ; then - AC_DEFINE(CRUISE_DISABLE_CUSERID, 1, Define if cuserid() should be disabled), -fi] +AC_ARG_ENABLE([cuserid],[AS_HELP_STRING([--disable-cuserid],[Disables attempted use of cuserid() at run time])],[ + AS_IF([test "x$enableval" = "xno"],[ + AC_DEFINE(CRUISE_DISABLE_CUSERID, 1, Define if cuserid() should be disabled),],[])] ,) -AC_ARG_ENABLE(ld-preload, -[ --disable-ld-preload Disables support for LD_PRELOAD library], -[if test "x$enableval" = "xno" ; then - DISABLE_LDPRELOAD="1" -fi] -,) +AC_ARG_ENABLE(ld-preload,[AS_HELP_STRING([--disable-ld-preload],[Disables support for LD_PRELOAD library])],[ + AS_IF([test "x$enableval" = "xno"],[ + DISABLE_LDPRELOAD="1" + ],[]) + ] +,[]) + +AC_ARG_ENABLE(st-dev-workaround, + [AS_HELP_STRING([--enable-st-dev-workaround],[Gather device id from parent directory instead of file])],[ + AS_IF([test "x$enableval" = "xyes"],[ + 
AC_DEFINE(__CP_ST_DEV_WORKAROUND, 1, Define if device id should be taken from parent directory rather than file) + DISABLE_LDPRELOAD="1" + ],[]) + ] +,[]) + +# use mpi for bootstraping unifyfsd +AC_ARG_ENABLE(unifyfsd-mpi, + [AS_HELP_STRING([--enable-unifyfsd-mpi],[Use MPI for bootstrapping unifyfsd])],[ + AS_IF([test "x$enableval" = "xyes"],[ + AC_DEFINE(UNIFYFSD_USE_MPI, 1, Define if unifyfsd has to use mpi for bootstrapping)],[]) + ] +,[]) -AC_ARG_ENABLE(st-dev-workaround, -[ --enable-st-dev-workaround Gather device id from parent directory instead of file], -[if test "x$enableval" = "xyes" ; then - AC_DEFINE(__CP_ST_DEV_WORKAROUND, 1, Define if device id should be taken from parent directory rather than file) - DISABLE_LDPRELOAD="1" -fi] -,) + +# look for MPI and set flags +LX_FIND_MPI +AS_IF([test "x$enable_fortran" = "xyes"],[ + AC_LANG_PUSH([Fortran]) + LX_FIND_MPI + AC_LANG_POP +],[]) + +AS_IF([test "$have_C_mpi" != "yes"], + AC_MSG_ERROR(["Couldn't find MPI"]), + [] +) + +# look for gotcha library, sets GOTCHA_INCLUDE, GOTCHA_LIB +UNIFYFS_AC_GOTCHA + +# look for spath library, sets SPATH_CFLAGS, SPATH_LDFLAGS, SPATH_LIBS +UNIFYFS_AC_SPATH + +# look for margo library, sets MARGO_CFLAGS, MARGO_LIBS +UNIFYFS_AC_MARGO + +# openssl for md5 checksum +UNIFYFS_AC_OPENSSL # checks to see how we can print 64 bit values on this architecture gt_INTTYPES_PRI @@ -164,28 +195,20 @@ AC_TRY_COMPILE( AC_MSG_RESULT(no) ) -AC_CHECK_HEADERS(mntent.h sys/mount.h) - -# look for MPI and set flags -LX_FIND_MPI -if test "x$enable_fortran" = "xyes"; then -AC_LANG_PUSH([Fortran]) -LX_FIND_MPI -AC_LANG_POP -fi - -if test "$have_C_mpi" != "yes" ; then - AC_MSG_ERROR(["Couldn't find MPI"]) -fi - -# look for leveldb library, sets LEVELDB_CFLAGS/LDFLAGS/LIBS -UNIFYFS_AC_LEVELDB - -# look for gotcha library, sets GOTCHA_INCLUDE, GOTCHA_LIB -UNIFYFS_AC_GOTCHA +AC_MSG_CHECKING(if linker supports -wrap) +OLD_LDFLAGS=$LDFLAGS +LDFLAGS=$LDFLAGS +LDFLAGS+="-Wl,-wrap,malloc" +AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include ]],[[void* __wrap_malloc(size_t size);]],[[int *test = malloc(sizeof(int));]])], +[ + AC_MSG_RESULT([yes]) + AM_CONDITIONAL([HAVE_LD_WRAP],[true]) +],[ + AC_MSG_RESULT([no]) + AM_CONDITIONAL([HAVE_LD_WRAP],[false]) +]) +LDFLAGS=$OLD_LDFLAGS -UNIFYFS_AC_MARGO -UNIFYFS_AC_FLATCC # HDF found? 
AX_LIB_HDF5 @@ -196,21 +219,52 @@ AM_CONDITIONAL([HAVE_HDF5], [test x$with_hdf5 = xyes]) CP_WRAPPERS+="-Wl,-wrap,access" CP_WRAPPERS+=",-wrap,chmod" CP_WRAPPERS+=",-wrap,fchmod" -CP_WRAPPERS+=",-wrap,lio_listio" + +OLD_LIBS=$LIBS +LIBS+=" -lrt" +AC_CHECK_FUNCS(lio_listio,[ + CP_WRAPPERS+=",-wrap,lio_listio" +], []) +LIBS=$OLD_LIBS + CP_WRAPPERS+=",-wrap,mkdir" CP_WRAPPERS+=",-wrap,rmdir" +CP_WRAPPERS+=",-wrap,chdir" +CP_WRAPPERS+=",-wrap,__getcwd_chk" +CP_WRAPPERS+=",-wrap,getcwd" +CP_WRAPPERS+=",-wrap,getwd" +CP_WRAPPERS+=",-wrap,get_current_dir_name" CP_WRAPPERS+=",-wrap,unlink" CP_WRAPPERS+=",-wrap,remove" CP_WRAPPERS+=",-wrap,rename" CP_WRAPPERS+=",-wrap,truncate" CP_WRAPPERS+=",-wrap,stat" -CP_WRAPPERS+=",-wrap,__lxstat" -CP_WRAPPERS+=",-wrap,__xstat" +CP_WRAPPERS+=",-wrap,fstat" + + +AC_CHECK_FUNCS(statfs,[ + CP_WRAPPERS+=",-wrap,statfs" +],[]) +AC_CHECK_FUNCS(fstatfs,[ + CP_WRAPPERS+=",-wrap,fstatfs" +],[]) + +AC_CHECK_FUNCS(__lxstat,[ + CP_WRAPPERS+=",-wrap,__lxstat" +],[]) +AC_CHECK_FUNCS(__xstat,[ + CP_WRAPPERS+=",-wrap,__xstat" +],[]) +AC_CHECK_FUNCS(__fxstat,[ + CP_WRAPPERS+=",-wrap,__fxstat" +],[]) CP_WRAPPERS+=",-wrap,creat" CP_WRAPPERS+=",-wrap,creat64" CP_WRAPPERS+=",-wrap,open" -CP_WRAPPERS+=",-wrap,open64" +AC_CHECK_FUNCS(open64, [ + CP_WRAPPERS+=",-wrap,open64" +],[]) CP_WRAPPERS+=",-wrap,__open_2" CP_WRAPPERS+=",-wrap,read" CP_WRAPPERS+=",-wrap,write" @@ -220,9 +274,12 @@ CP_WRAPPERS+=",-wrap,pread" CP_WRAPPERS+=",-wrap,pread64" CP_WRAPPERS+=",-wrap,pwrite" CP_WRAPPERS+=",-wrap,pwrite64" -CP_WRAPPERS+=",-wrap,posix_fadvise" +AC_CHECK_FUNCS(posix_fadvise, [ + CP_WRAPPERS+=",-wrap,posix_fadvise" +],[]) CP_WRAPPERS+=",-wrap,lseek" CP_WRAPPERS+=",-wrap,lseek64" +CP_WRAPPERS+=",-wrap,fchdir" CP_WRAPPERS+=",-wrap,ftruncate" CP_WRAPPERS+=",-wrap,fsync" CP_WRAPPERS+=",-wrap,fdatasync" @@ -231,7 +288,6 @@ CP_WRAPPERS+=",-wrap,mmap" CP_WRAPPERS+=",-wrap,mmap64" CP_WRAPPERS+=",-wrap,munmap" CP_WRAPPERS+=",-wrap,msync" -CP_WRAPPERS+=",-wrap,__fxstat" CP_WRAPPERS+=",-wrap,close" # FILE* functions @@ -288,11 +344,11 @@ CP_WRAPPERS+=",-wrap,ungetwc" # ,-u,__wrap___fxstat64,-u,pthread_mutex_lock,-u,pthread_mutex_unlock -# We need to know the value of the $libdir and $bindir variables so that +# We need to know the value of the $libdir and $bindir variables so that # we can reference the correct path in the unifyfs compiler wrappers. -# Unfortunately, those two variables are not normally evaluated by autoconf. -# They are evaluated at build time using Makefile variable substitutions. -# +# Unfortunately, those two variables are not normally evaluated by autoconf. +# They are evaluated at build time using Makefile variable substitutions. +# # The following logic was copied from mpich2 1.3.1 to resolve the $libdir # variable at configure time. 
# @@ -339,7 +395,9 @@ AC_CONFIG_FILES([Makefile util/scripts/Makefile util/scripts/lsfcsm/Makefile util/unifyfs/Makefile - util/unifyfs/src/Makefile]) + util/unifyfs/src/Makefile + util/unifyfs-stage/Makefile + util/unifyfs-stage/src/Makefile]) AC_CONFIG_FILES([client/unifyfs-config], [chmod +x client/unifyfs-config]) AC_CONFIG_FILES([util/scripts/lsfcsm/unifyfs_lsfcsm_prolog], [chmod +x util/scripts/lsfcsm/unifyfs_lsfcsm_prolog]) @@ -358,15 +416,15 @@ AC_OUTPUT AC_MSG_RESULT([ ========================== - UNIFYFS Checkpoint-Restart + UNIFYFS ========================== prefix ${prefix} compiler ${CC} CFLAGS ${CFLAGS} ========================== - + Supported POSIX wrappers: - + ${CP_WRAPPERS} ]) diff --git a/docs/add-rpcs.rst b/docs/add-rpcs.rst index e4691e145..4c95951ae 100644 --- a/docs/add-rpcs.rst +++ b/docs/add-rpcs.rst @@ -18,9 +18,9 @@ Common The struct definition macro `MERCURY_GEN_PROC()` is used to define both input and output parameters. For client-server RPCs, the - definitions should be placed in `common/src/unifyfs_clientcalls_rpc.h`, + definitions should be placed in `common/src/unifyfs_client_rpcs.h`, while server-server RPC structs are defined in - `common/src/unifyfs_servercalls_rpc.h`. + `common/src/unifyfs_server_rpcs.h`. The input parameters struct should contain all values the client needs to pass to the server handler function. @@ -30,23 +30,12 @@ Common .. code-block:: C MERCURY_GEN_PROC(unifyfs_mount_in_t, - ((int32_t)(app_id)) - ((int32_t)(local_rank_idx)) ((int32_t)(dbg_rank)) - ((int32_t)(num_procs_per_node)) - ((hg_const_string_t)(client_addr_str)) - ((hg_size_t)(req_buf_sz)) - ((hg_size_t)(recv_buf_sz)) - ((hg_size_t)(superblock_sz)) - ((hg_size_t)(meta_offset)) - ((hg_size_t)(meta_size)) - ((hg_size_t)(fmeta_offset)) - ((hg_size_t)(fmeta_size)) - ((hg_size_t)(data_offset)) - ((hg_size_t)(data_size)) - ((hg_const_string_t)(external_spill_dir))) + ((hg_const_string_t)(mount_prefix)) + ((hg_const_string_t)(client_addr_str))) MERCURY_GEN_PROC(unifyfs_mount_out_t, - ((hg_size_t)(max_recs_per_slice)) + ((int32_t)(app_id)) + ((int32_t)(client_id)) ((int32_t)(ret))) .. note:: @@ -66,8 +55,7 @@ Server This is the function that will be invoked on the client and executed on the server. Client-server RPC handler functions are implemented in `server/src/unifyfs_cmd_handler.c`, while server-server RPC handlers go - in `server/src/unifyfs_service_manager.c`. The RPC handler input and output - parameters structs are defined in `common/src/unifyfs_clientcalls_rpc.h`. + in `server/src/unifyfs_service_manager.c`. All the RPC handler functions follow the same protoype, which is passed a Mercury handle as the only argument. The handler function should use diff --git a/docs/api-mount.rst b/docs/api-mount.rst index 5ce67a556..06ea571f3 100644 --- a/docs/api-mount.rst +++ b/docs/api-mount.rst @@ -19,17 +19,19 @@ In this section, we describe how to use the UnifyFS API in an application. Mounting --------------------------- -In ``C`` applications, include *unifyfs.h*. See writeread.c_ for a full +In C or C++ applications, include ``unifyfs.h``. See writeread.c_ for a full example. .. code-block:: C + :caption: C #include -In ``Fortran`` applications, include *unifyfsf.h*. See writeread.f90_ for a +In Fortran applications, include ``unifyfsf.h``. See writeread.f90_ for a full example. .. code-block:: Fortran + :caption: Fortran include 'unifyfsf.h' @@ -48,7 +50,7 @@ filesystem. 
For instance, to use UnifyFS on all path prefixes that begin with call UNIFYFS_MOUNT('/tmp', rank, size, 0, ierr); -Where /tmp is the path prefix you want UnifyFS to intercept. The rank and rank +Where ``/tmp`` is the path prefix you want UnifyFS to intercept. The rank and rank number is the rank you are currently on, and the number of tasks you have running in your job. Lastly, the zero corresponds to the app id. @@ -61,17 +63,13 @@ When you are finished using UnifyFS in your application, you should unmount. .. code-block:: C :caption: C - if (rank == 0) { - unifyfs_unmount(); - } + unifyfs_unmount(); .. code-block:: Fortran :caption: Fortran call UNIFYFS_UNMOUNT(ierr); -It is only necessary to call unmount once on rank zero. - .. explicit external hyperlink targets .. _ifort_issue: https://github.com/LLNL/UnifyFS/issues/300 diff --git a/docs/assumptions.rst b/docs/assumptions.rst index 0c099c2dc..c2af68b97 100644 --- a/docs/assumptions.rst +++ b/docs/assumptions.rst @@ -5,63 +5,84 @@ Assumptions In this section, we provide assumptions we make about the behavior of applications that use UnifyFS, and about how UnifyFS currently functions. +--------------------------- +System Requirements +--------------------------- + +UnifyFS uses node-local storage devices, e.g., RAM and SSD, for storing the +application data. Therefore, a system should support the following requirements +to run UnifyFS. + + - A compute node is equipped with a local storage device that UnifyFS can + use for storing file data, e.g., SSD or RAM. + + - An ability for UnifyFS to launch user-level daemon processes on compute + nodes, which run concurrently with user application processes + --------------------------- Application Behavior --------------------------- - - Workload supported is globally synchronous checkpointing. - - I/O occurs in write and read phases. Files are not read and written at - the same time. There is some (good) amount of time between the two phases. - For example, files are written during checkpoint phases and only read - during recovery or restart. +UnifyFS is specifically designed to support globally synchronous checkpointing +workloads. In such a workload, the expected application behavior is as follows. - - Processes on any node can read any byte in the file (not just local - data), but the common case will be processes read only their local bytes. + - I/O operations occur in separate write and read phases, and thus files are + not read and written simultaneously. For instance, files are only written + during the checkpointing (a write phase) and only read during the + recovery/restart (a read phase). - - Assume general parallel I/O concurrency semantics where processes can - write to the same offset concurrently. We assume the outcome of concurrent - writes to the same offset or other conflicting concurrent accesses is - undefined. For example, if a command in the job renames a file while the - parallel application is writing to it, the outcome is undefined. It could - be a failure or not, depending on timing. + - During the read phase, a process can read any byte in a file including + remote data that has been written by processes in remote compute nodes. + However, reading the local data (which has been written by processes in + the same compute node) will be faster than reading the remote data. + + - During the write phase, the result of concurrently writing to the same + file offset by multiple processes is undefined. 
Similarly, multiple + processes writing to an overlapped region also leads to an undefined + result. For example, if a command in the job renames a file while the + parallel application is writing to it, the outcome is undefined, i.e., it + could be a success or failure depending on timing. --------------------------- Consistency Model --------------------------- + One key aspect of UnifyFS is the idea of "laminating" a file. After a file is -laminated, it becomes "set in stone" and its data is accessible across all the -nodes. Laminated files are permanently read-only, and cannot be modified in -any way (but can be deleted). If the application process group fails before a -file has been laminated, UnifyFS may delete the file. +laminated, it becomes "set in stone," and its data is accessible across all the +nodes. Laminated files are permanently read-only and cannot be further modified, +except for being renamed or deleted. If the application process group fails +before a file has been laminated, UnifyFS may delete the file. -The typical use case is to laminate your checkpoint files after they've been -written. To laminate a file, first call fsync() to sync all your writes to the -server, then call chmod() to remove all the write bits. Removing the write -bits does the actual lamination. A typical checkpoint will look like this: +A typical use case is to laminate application checkpoint files after they have +been successfully written. To laminate a file, an application can simply call +chmod() to remove all the write bits, after its write phase is completed. When +write bits of a file are all canceled, UnifyFS will internally laminate the +file. A typical checkpoint will look like: .. code-block:: C - fp = fopen("checkpoint1.chk") - write(fp, ) - fsync(fp) - fclose(fp) + fd = open("checkpoint1.chk", O_WRONLY) + write(fd, , ) + close(fd) chmod("checkpoint1.chk", 0444) Future versions of UnifyFS may support different laminate semantics, such as -laminate on close(), or laminate via an explicit API call. +laminate on close() or laminate via an explicit API call. + +We define the laminated consistency model to enable certain optimizations while +supporting the perceived requirements of application checkpoints. Since remote +processes are not permitted to read arbitrary bytes of a file until its +lamination, UnifyFS can buffer all data and metadata of the file locally +(instead of exchanging indexing information between compute nodes) before the +lamination occurs. Also, since file contents cannot change after lamination, +aggressive caching may be used during the read phase with minimal locking. +Further, since a file may be lost on application failure unless laminated, data +redundancy schemes can be delayed until lamination. -We define the laminated consistency model to enable certain optimizations -while supporting the perceived requirements of application checkpoints. -Since remote processes are not permitted to read arbitrary bytes within the -file until lamination, -global exchange of file data and/or data index information can be buffered -locally on each node until the point of lamination. -Since file contents cannot change after lamination, -aggressive caching may be used during the read-only phase with minimal locking. -Since a file may be lost on application failure unless laminated, -data redundancy schemes can be delayed until lamination. +The following lists summarize available application I/O operations according to +our consistency model. 
-Behavior before lamination: +Behavior before lamination (write phase): - open/close: A process may open/close a file multiple times. @@ -77,7 +98,7 @@ Behavior before lamination: - unlink: A process may delete a file. -Behavior after lamination: +Behavior after lamination (read phase): - open/close: A process may open/close a file multiple times. @@ -95,32 +116,32 @@ Behavior after lamination: File System Behavior --------------------------- - - The file system exists on node local storage only and is not persisted to - stable storage like a parallel file system (PFS). Can be coupled with +The additional behavior of UnifyFS can be summarized as follows. - - SymphonyFS or high level I/O or checkpoint library (VeloC) to move data to - PFS periodically, or data can be moved manually + - UnifyFS exists on node local storage only and is not automatically + persisted to stable storage like a parallel file system (PFS). When the + data needs to be persisted to an external file system, users can use + :ref:`unifyfs utility ` with its data staging + options. - - Can be used with checkpointing libraries (VeloC) or I/O libraries to - support shared files on burst buffers + - UnifyFS also can be coupled with SymphonyFS_, high level I/O libraries, or + a checkpoint library (VeloC_) to move data to PFS periodically. - - File system starts empty at job start. User job must populate the file - system. + - UnifyFS can be used with checkpointing libraries (VeloC_) or other I/O + libraries to support shared files on burst buffers. - - Shared file system namespace across all compute nodes in a job, even if - an application process is not running on all compute nodes + - UnifyFS starts empty at job start. User job must populate the file system + manually or by using + :ref:`unifyfs utility `. - - Survives application termination and/or relaunch within a job + - UnifyFS creates a shared file system namespace across all compute nodes in + a job, even if an application process is not running on all compute nodes. - - Will transparently intercept system level I/O calls of applications and - I/O libraries + - UnifyFS survives across multiple application runs within a job. ---------------------------- -System Characteristics ---------------------------- + - UnifyFS will transparently intercept system level I/O calls of + applications and I/O libraries. - - There is some storage available for storing file data on a compute node, - e.g. SSD or RAM disk +.. _SymphonyFS: https://code.ornl.gov/techint/SymphonyFS +.. _VeloC: https://github.com/ECP-VeloC/VELOC - - We can run user-level daemon processes on compute nodes concurrently with - a user application diff --git a/docs/build-intercept.rst b/docs/build-intercept.rst index 2989c4cb0..4887bff4f 100644 --- a/docs/build-intercept.rst +++ b/docs/build-intercept.rst @@ -4,55 +4,106 @@ Build & I/O Interception In this section, we describe how to build UnifyFS with I/O interception. -.. note:: - - The current version of UnifyFS adopts the mdhim key-value store, which strictly - requires: +--------------------------- - "An MPI distribution that supports MPI_THREAD_MULTIPLE and per-object locking of - critical sections (this excludes OpenMPI up to version 3.0.1, the current version as of this writing)" +--------------------------------------- +UnifyFS Build Configuration Options +--------------------------------------- - as specified in the project `github `_. 
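As a point of reference before the per-option subsections below, here is a minimal, illustrative sketch of a configure invocation that combines several of the options this section describes. The install prefix is a placeholder, and the particular combination of flags is only an example; which flags you actually pass depends on your system and on which dependencies are available.

.. code-block:: Bash

    $ ./autogen.sh
    $ ./configure --prefix=/path/to/install \
          --enable-fortran \
          --enable-mpi-mount \
          --enable-pmix \
          --without-hdf5
    $ make
    $ make install

Each of these flags is explained individually in the subsections that follow.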
+Fortran +******* + +To enable UnifyFS use with Fortran applications, pass the ``--enable-fortran`` +option to configure. Note that only GCC Fortran (i.e., gfortran) is known to +work with UnifyFS. There is an open +`ifort_issue `_ with the Intel +Fortran compiler as well as an +`xlf_issue <://github.com/LLNL/UnifyFS/issues/304>`_ with the IBM Fortran +compiler. + +GOTCHA +****** + +GOTCHA is the preferred method for I/O interception with UnifyFS, but it is not +available on all platforms. If GOTCHA is not available on your target system, +you can omit it during UnifyFS configuration by using the ``--without-gotcha`` +configure option. Without GOTCHA, static linker wrapping is required for I/O +interception. + +HDF5 +**** + +UnifyFS includes example programs that use HDF5. If HDF5 is not available on +your target system, it can be omitted during UnifyFS configuration by using +the ``--without-hdf5`` configure option. + +MDHIM +***** + +Previous MDHIM-based support for file operations can be selected at configure +time using the ``--enable-mdhim`` option. Using this option requires LevelDB as +a dependency. Provide the path to your LevelDB install at configure time with +the ``--with-leveldb=/path/to/leveldb`` option. Note that this may not +currently be in a usable state. + +PMI2/PMIx Key-Value Store +************************* + +When available, UnifyFS uses the distributed key-value store capabilities +provided by either PMI2 or PMIx. To enable this support, pass either +the ``--enable-pmi`` or ``--enable-pmix`` option to configure. Without +PMI support, a distributed file system accessible to all servers is required. + +Transparent Mounting for MPI Applications +***************************************** + +MPI applications written in C or C++ may take advantage of the UnifyFS transparent +mounting capability. With transparent mounting, calls to ``unifyfs_mount()`` and +``unifyfs_unmount()`` are automatically performed during ``MPI_Init()`` and +``MPI_Finalize()``, respectively. Transparent mounting always uses ``/unifyfs`` as +the namespace mountpoint. To enable transparent mounting, use the +``--enable-mpi-mount`` configure option. -.. _build-label: +--------------------------- --------------------------- -How to Build UnifyFS +Building UnifyFS with Spack --------------------------- +Full Build +********** + To install all dependencies and set up your build environment, we recommend using the `Spack package manager `_. If you already have Spack, make sure you have the latest release or if using a clone of their develop branch, ensure you have pulled the latest changes. -Building with Spack -******************** - -These instructions assume that you do not already have a module system installed -such as LMod, Dotkit, or Environment Modules. If your system already has Dotkit -or LMod installed then installing the environment-modules package with Spack -is unnecessary (so you can safely skip that step). +.. _build-label: -If you use Dotkit then replace ``spack load`` with ``spack use``. -First, install Spack if you don't already have it: +Install Spack +^^^^^^^^^^^^^ .. code-block:: Bash $ git clone https://github.com/spack/spack - $ ./spack/bin/spack install environment-modules + $ # optionally create a packages.yaml specific to your machine $ . spack/share/spack/setup-env.sh Make use of Spack's `shell support `_ to automatically add Spack to your ``PATH`` and allow the use of the ``spack`` command. -Then install UnifyFS: +Install UnifyFS +^^^^^^^^^^^^^^^ .. 
code-block:: Bash $ spack install unifyfs $ spack load unifyfs +If the most recent changes on the development branch ('dev') of UnifyFS are +desired, then do ``spack install unifyfs@develop``. + .. Edit the following admonition if the default of variants are changed or when new variants are added. @@ -62,18 +113,19 @@ build is desired. Type ``spack info unifyfs`` for more info. .. table:: UnifyFS Build Variants :widths: auto - ======= ======================================== ========================= - Variant Command Description - ======= ======================================== ========================= - HDF5 ``spack install unifyfs+hdf5`` Build with parallel HDF5 + ========== ======================================== =========================== + Variant Command Description + ========== ======================================== =========================== + Auto-mount ``spack install unifyfs+auto-mount`` Enable transparent mounting + HDF5 ``spack install unifyfs+hdf5`` Build with parallel HDF5 - ``spack install unifyfs+hdf5 ^hdf5~mpi`` Build with serial HDF5 - Fortran ``spack install unifyfs+fortran`` Build with gfortran - NUMA ``spack install unifyfs+numa`` Build with NUMA - pmpi ``spack install unifyfs+pmpi`` Transparent mount/unmount - PMI ``spack install unifyfs+pmi`` Enable PMI2 build options - PMIx ``spack install unifyfs+pmix`` Enable PMIx build options - ======= ======================================== ========================= + ``spack install unifyfs+hdf5 ^hdf5~mpi`` Build with serial HDF5 + Fortran ``spack install unifyfs+fortran`` Enable Fortran support + MDHIM ``spack install unifyfs+mdhim`` Enable MDHIM build options + PMI ``spack install unifyfs+pmi`` Enable PMI2 support + PMIx ``spack install unifyfs+pmix`` Enable PMIx support + spath ``spack install unifyfs+spath`` Normalize relative paths + ========== ======================================== =========================== .. attention:: @@ -88,34 +140,28 @@ build is desired. Type ``spack info unifyfs`` for more info. --------------------------- -Building with Autotools -************************ - -Download the latest UnifyFS release from the `Releases -`_ page. - -Building the Dependencies -^^^^^^^^^^^^^^^^^^^^^^^^^^ +Manual Build +************ -UnifyFS requires MPI, LevelDB, GOTCHA(version 0.0.2), FlatCC, and Margo. -References to these dependencies can be found :doc:`here `. +Optionally, you can install the dependencies with Spack and still build UnifyFS +manually. This is useful if wanting to be able to edit the UnifyFS source code +between builds, but still letting Spack take care of the dependencies. Take +advantage of +`Spack Environments `_ +to streamline this process. .. _spack-build-label: -Build the Dependencies with Spack -"""""""""""""""""""""""""""""""""" - -Once Spack is installed on your system (see :ref:`above `), you -can install just the dependencies for an easier manual installation of UnifyFS. +Build the Dependencies +^^^^^^^^^^^^^^^^^^^^^^ -If you use Dotkit then replace ``spack load`` with ``spack use``. +Once Spack is installed on your system (see :ref:`above `), the +UnifyFS dependencies can then be installed. .. code-block:: Bash - $ spack install leveldb - $ spack install gotcha@0.0.2 - $ spack install flatcc - $ spack install margo + $ spack install gotcha + $ spack install margo ^mercury+bmi .. tip:: @@ -124,15 +170,22 @@ If you use Dotkit then replace ``spack load`` with ``spack use``. 
Keep in mind this will also install all the build dependencies and dependencies of dependencies if you haven't already installed them through - Spack or told Spack where they are locally installed on your system. + Spack or told Spack where they are locally installed on your system via a + `packages.yaml `_. -Then to manually build UnifyFS: +Build UnifyFS +^^^^^^^^^^^^^ + +Download the latest UnifyFS release from the `Releases +`_ page or clone the develop branch +from the `UnifyFS repository `_. + +Once the dependencies are installed, load them into your environment and then +manually build UnifyFS from inside the source code directory. .. code-block:: Bash - $ spack load leveldb - $ spack load gotcha@0.0.2 - $ spack load flatcc + $ spack load gotcha $ spack load mercury $ spack load argobots $ spack load margo @@ -142,36 +195,35 @@ Then to manually build UnifyFS: $ make $ make install -.. note:: **Fortran Compatibility** - - To build with gfortran compatibility, include the ``--enable-fortran`` - configure option: +To see all available build configuration options, run ``./configure --help`` +after ``./autogen.sh`` has been run. - ``./configure --prefix=/path/to/install/ --enable-fortran`` +--------------------------- - There is a known `ifort_issue `_ - with the Intel Fortran compiler as well as an `xlf_issue <://github.com/LLNL/UnifyFS/issues/304>`_ - with the IBM Fortran compiler. Other Fortran compilers are currently - unknown. +------------------------------- +Building UnifyFS with Autotools +------------------------------- -To see all available build configuration options, type ``./configure --help`` -after ``./autogen.sh`` has been run. +Download the latest UnifyFS release from the `Releases +`_ page or clone the develop branch +from the `UnifyFS repository `_. -.. TODO: Add a section in build docs that shows all the build config options +Build the Dependencies +********************** -Build the Dependencies without Spack -""""""""""""""""""""""""""""""""""""" +UnifyFS requires MPI, GOTCHA, Margo and OpenSSL. +References to these dependencies can be found on our :doc:`dependencies` page. -For users who cannot use Spack, a `bootstrap.sh `_ -script has been provided in order to make manual build and installation of -dependencies easier. Simply run the script in the top level directory of the source code. +A `bootstrap.sh `_ script +has been provided in order to make manual build and installation of dependencies +easier. Simply run the script in the top level directory of the source code. .. code-block:: Bash $ ./bootstrap.sh -References to the UnifyFS dependencies can be found :doc:`here `. - +Build UnifyFS +************* After bootstrap.sh is finished building the dependencies, it will print out the commands you need to run to build UnifyFS. The commands look something like @@ -181,20 +233,12 @@ this: $ export PKG_CONFIG_PATH=path/to/mercury/lib/pkgconfig:path/to/argobots/lib/pkgconfig:path/to/margo/lib/pkgconfig $ ./autogen.sh - $ ./configure --prefix=/path/to/install --with-gotcha=/path/to/gotcha --with-leveldb=/path/to/leveldb --with-flatcc=/path/to/flatcc + $ ./configure --prefix=/path/to/install --with-gotcha=/path/to/gotcha $ make $ make install -.. note:: - - You may need to add the following to your configure line if it is not in - your default path on a linux machine: - - ``--with-numa=$PATH_TO_NUMA`` - - This is needed to enable NUMA-aware memory allocation on Linux machines. 
Set the - NUMA policy at runtime with ``UNIFYFS_NUMA_POLICY = local | interleaved``, or set - NUMA nodes explicitly with ``UNIFYFS_USE_NUMA_BANK = `` +To see all available build configuration options, run ``./configure --help`` +after ``./autogen.sh`` has been run. --------------------------- diff --git a/docs/conf.py b/docs/conf.py index f1abca286..1f0ac6a60 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -46,17 +46,17 @@ # General information about the project. project = u'UnifyFS' -copyright = u'2017, Lawrence Livermore National Security LLC LLNL-CODE-741539, UT-Batelle LLC' -author = u'Kathryn Mohror, Adam Moody, Oral Sarp, Feiyi Wang, Hyogi Sim, Danielle Sikich, Joseph Moore, Ned Bass' +copyright = u'2020, Lawrence Livermore National Security LLC, LLNL-CODE-741539, UT-Batelle LLC' +author = u'Kathryn Mohror, Adam Moody, Oral Sarp, Feiyi Wang, Hyogi Sim, Swen Boehm, Michael Brim, Danielle Sikich, Joseph Moore, Ned Bass, Tony Hutter, Celso Mendes, Craig Steffen, Cameron Stanavige' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = u'0.1' +version = u'0.9.1' # The full version, including alpha/beta/rc tags. -release = u'0.1' +release = u'0.9.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -127,7 +127,7 @@ # The name of an image file (relative to this directory) to place at the top # of the sidebar. -html_logo = 'images/unify-logo.png' +html_logo = 'images/UnifyFS-logo.png' # The name of an image file (relative to this directory) to use as a favicon of # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 @@ -231,7 +231,7 @@ def setup(app): app.add_stylesheet("theme_overrides.css" ) # The name of an image file (relative to this directory) to place at the top of # the title page. -latex_logo = 'images/unify-logo.png' +latex_logo = 'images/UnifyFS-logo.png' # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. diff --git a/docs/configuration.rst b/docs/configuration.rst index 2a4f55ec1..6545487f6 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -14,11 +14,11 @@ certain settings have command line options. When defined via multiple methods, the command line options have the highest priority, followed by environment variables, and finally config file options from ``unifyfs.conf``. -The config file is installed in /etc by default. However, one can -specify a custom location for the -unifyfs.conf file with the -f command-line option to unifyfsd (see below). -There is a sample unifyfs.conf file in the installation directory -under etc/unifyfs/. This file is also available in the "extras" directory +The system-wide configuration file is used by default when available. +However, users can specify a custom location for the configuration file using +the ``-f`` command-line option to ``unifyfsd`` (see below). +There is a sample ``unifyfs.conf`` file in the installation directory +under ``etc/unifyfs/``. This file is also available in the ``extras`` directory in the source repository. The unified method for providing configuration control is adapted from @@ -50,37 +50,68 @@ a given section and key. .. 
table:: ``[unifyfs]`` section - main configuration settings :widths: auto - ============= ====== ===================================================== + ============= ====== =============================================== Key Type Description - ============= ====== ===================================================== + ============= ====== =============================================== cleanup BOOL cleanup storage on server exit (default: off) configfile STRING path to custom configuration file consistency STRING consistency model [ LAMINATED | POSIX | NONE ] daemonize BOOL enable server daemonization (default: off) mountpoint STRING mountpoint path prefix (default: /unifyfs) - ============= ====== ===================================================== + ============= ====== =============================================== .. table:: ``[client]`` section - client settings :widths: auto - ============= ====== ===================================================== - Key Type Description - ============= ====== ===================================================== - max_files INT maximum number of open files per client process - ============= ====== ===================================================== + ================ ====== ================================================================= + Key Type Description + ================ ====== ================================================================= + cwd STRING effective starting current working directory + max_files INT maximum number of open files per client process (default: 128) + local_extents BOOL service reads from local data if possible (default: off) + recv_data_size INT maximum size (B) of memory buffer for receiving data from server + write_index_size INT maximum size (B) of memory buffer for storing write log metadata + write_sync BOOL sync data to server after every write (default: off) + ================ ====== ================================================================= + +The ``cwd`` setting is used to emulate the behavior one +expects when changing into a working directory before starting a job +and then using relative file names within the application. +If set, the application changes its working directory to +the value specified in ``cwd`` when ``unifyfs_mount`` is called. +The value specified in ``cwd`` must be within the directory space +of the UnifyFS mount point. + +Enabling the ``local_extents`` optimization may significantly improve read +performance. However, it should not be used by applications +in which different processes write to a given byte offset within +the file, nor should it be used with applications that truncate +files. .. 
table:: ``[log]`` section - logging settings :widths: auto - ============= ====== ===================================================== - Key Type Description - ============= ====== ===================================================== - dir STRING path to directory to contain server log file - file STRING server log file base name (rank will be appended) - verbosity INT server logging verbosity level [0-5] (default: 0) - ============= ====== ===================================================== + ========== ====== ================================================== + Key Type Description + ========== ====== ================================================== + dir STRING path to directory to contain server log file + file STRING log file base name (rank will be appended) + verbosity INT logging verbosity level [0-5] (default: 0) + ========== ====== ================================================== -.. table:: ``[meta]`` section - metadata settings +.. table:: ``[logio]`` section - log-based write data storage settings + :widths: auto + + =========== ====== ============================================================ + Key Type Description + =========== ====== ============================================================ + chunk_size INT data chunk size (B) (default: 4 MiB) + shmem_size INT maximum size (B) of data in shared memory (default: 256 MiB) + spill_size INT maximum size (B) of data in spillover file (default: 1 GiB) + spill_dir STRING path to spillover data directory + =========== ====== ============================================================ + +.. table:: ``[meta]`` section - MDHIM metadata settings :widths: auto ============= ====== ===================================================== @@ -95,54 +126,30 @@ a given section and key. .. table:: ``[runstate]`` section - server runstate settings :widths: auto - ============= ====== ===================================================== - Key Type Description - ============= ====== ===================================================== - dir STRING path to directory to contain server runstate file - ============= ====== ===================================================== + ======== ====== =============================================== + Key Type Description + ======== ====== =============================================== + dir STRING path to directory to contain server-local state + ======== ====== =============================================== .. table:: ``[server]`` section - server settings :widths: auto - ============= ====== ===================================================== - Key Type Description - ============= ====== ===================================================== - hostfile STRING path to server hostfile - ============= ====== ===================================================== + ============ ====== ============================================================================= + Key Type Description + ============ ====== ============================================================================= + hostfile STRING path to server hostfile + init_timeout INT timeout in seconds to wait for servers to be ready for clients (default: 120) + ============ ====== ============================================================================= .. 
table:: ``[sharedfs]`` section - server shared files settings :widths: auto - ============= ====== ===================================================== - Key Type Description - ============= ====== ===================================================== - dir STRING path to directory to contain server shared files - ============= ====== ===================================================== - -.. table:: ``[shmem]`` section - shared memory segment usage settings - :widths: auto - - ============= ====== ===================================================== - Key Type Description - ============= ====== ===================================================== - chunk_bits INT data chunk size (bits), size = 2^bits (default: 24) - chunk_mem INT segment size (B) for data chunks (default: 256 MiB) - recv_size INT segment size (B) for receiving data from local server - req_size INT segment size (B) for sending requests to local server - single BOOL use one memory region for all clients (default: off) - ============= ====== ===================================================== - -.. table:: ``[spillover]`` section - local data storage spillover settings - :widths: auto - - ============= ====== ===================================================== - Key Type Description - ============= ====== ===================================================== - enabled BOOL use local storage for data spillover (default: on) - data_dir STRING path to spillover data directory - meta_dir STRING path to spillover metadata directory - size INT maximum size (B) of spillover data (default: 1 GiB) - ============= ====== ===================================================== + ======== ====== ================================================= + Key Type Description + ======== ====== ================================================= + dir STRING path to directory to contain server shared files + ======== ====== ================================================= ----------------------- @@ -165,6 +172,10 @@ command line options have long and short forms. The long form uses ``--section-key=value``, while the short form ``- value``, where the short option character is given in the below table. +Note that for configuration options of type BOOL, the value is optional. +When not provided, the ``true`` value is assumed. If the short form option +is used, the value must immediately follow the option character (e.g., ``-Cyes``). + .. table:: ``unifyfsd`` command line options :widths: auto @@ -176,11 +187,12 @@ the short option character is given in the below table. --unifyfs-consistency -c --unifyfs-daemonize -D --unifyfs-mountpoint -m - --log-dir -L - --log-file -l --log-verbosity -v + --log-file -l + --log-dir -L --runstate-dir -R --server-hostfile -H --sharedfs-dir -S + --server-init_timeout -t ====================== ======== diff --git a/docs/contribute-ways.rst b/docs/contribute-ways.rst index 1deac78f2..c67279bf2 100644 --- a/docs/contribute-ways.rst +++ b/docs/contribute-ways.rst @@ -102,6 +102,21 @@ able to quickly identify and resolve issues. Documentation ============= +Here is our current documentation of how the internals of UnifyFS function for +several basic operations. + +.. rubric:: UnifyFS Developer's Documentation + +.. image:: images/UnifyFS-developers-documentation.png + :target: slides/UnifyFS-developers-documentation.pdf + :height: 72px + :align: left + :alt: UnifyFS Developer's Documentation + +:download:`Download slides `. 
+ +| + As UnifyFS is continually improved and updated, it is easy for documentation to become out-of-date. Any contributions to the documentation, no matter how small, is always greatly appreciated. If you are not in a position to update diff --git a/docs/dependencies.rst b/docs/dependencies.rst index aebb50112..fa6e7a664 100644 --- a/docs/dependencies.rst +++ b/docs/dependencies.rst @@ -2,19 +2,21 @@ UnifyFS Dependencies ==================== -- `GOTCHA `_ version 0.0.2 (compatibility with latest release in progress) +-------- +Required +-------- -- `leveldb `_ version 1.22 - -- `flatcc `_ version 0.5.3 +- `GOTCHA `_ version 1.0.3 - `Margo `_ version 0.4.3 and its dependencies: - - `Argobots `_ version 1.0rc1 + - `Argobots `_ version 1.0 - `Mercury `_ version 1.0.1 - `bmi `_ +- `OpenSSL `_ + .. important:: Margo uses pkg-config to ensure it compiles and links correctly with all of @@ -22,3 +24,12 @@ UnifyFS Dependencies ``PKG_CONFIG_PATH`` environment variable and include in that variable the paths for the ``.pc`` files for Mercury, Argobots, and Margo separated by colons. + +-------- +Optional +-------- + +- `leveldb `_ version 1.22 + needed when building with ``--enable-mdhim`` configure option + +- `spath `_ for normalizing relative paths diff --git a/docs/examples.rst b/docs/examples.rst index 1874d67a4..26b002736 100644 --- a/docs/examples.rst +++ b/docs/examples.rst @@ -31,8 +31,8 @@ To easily navigate to this location and find the examples, do: $ spack cd -i unifyfs $ cd libexec -Installed without Spack -^^^^^^^^^^^^^^^^^^^^^^^ +Installed with Autotools +^^^^^^^^^^^^^^^^^^^^^^^^ The autotools installation of UnifyFS will place the example programs in the *libexec/* directory of the path provided to ``--prefix=/path/to/install`` during @@ -67,8 +67,8 @@ The GOTCHA examples are one directory deeper in ``spack cd unifyfs+hdf5 ^hdf5~mpi`` -Built without Spack -^^^^^^^^^^^^^^^^^^^ +Built with Autotools +^^^^^^^^^^^^^^^^^^^^ The autotools build of UnifyFS will place the static and POSIX example programs in the *examples/src* directory and the GOTCHA example programs in the @@ -95,42 +95,50 @@ to aid in this process. Usage: write-static [options...] 
Available options: - -a, --appid= use given application id - (default: 0) - -A, --aio use asynchronous I/O instead of read|write - (default: off) - -b, --blocksize= I/O block size - (default: 16 MiB) - -c, --chunksize= I/O chunk size for each operation - (default: 1 MiB) - -d, --debug for debugging, wait for input (at rank 0) at start - (default: off) - -f, --file= target file name (or path) under mountpoint - (default: 'testfile') - -k, --check check data contents upon read - (default: off) - -L, --listio use lio_listio instead of read|write - (default: off) - -m, --mount= use for unifyfs - (default: /unifyfs) - -M, --mapio use mmap instead of read|write - (default: off) - -n, --nblocks= count of blocks each process will read|write - (default: 32) - -p, --pattern= 'n1' (N-to-1 shared file) or 'nn' (N-to-N file per process) - (default: 'n1') - -P, --prdwr use pread|pwrite instead of read|write - (default: off) - -S, --stdio use fread|fwrite instead of read|write - (default: off) - -U, --disable-unifyfs do not use UnifyFS - (default: enable UnifyFS) - -v, --verbose print verbose information - (default: off) - -V, --vecio use readv|writev instead of read|write - (default: off) - -x, --shuffle read different data than written - (default: off) + -a, --appid= use given application id + (default: 0) + -A, --aio use asynchronous I/O instead of read|write + (default: off) + -b, --blocksize= I/O block size + (default: 16 MiB) + -c, --chunksize= I/O chunk size for each operation + (default: 1 MiB) + -d, --debug for debugging, wait for input (at rank 0) at start + (default: off) + -f, --file= target file name (or path) under mountpoint + (default: 'testfile') + -k, --check check data contents upon read + (default: off) + -L, --listio use lio_listio instead of read|write + (default: off) + -m, --mount= use for unifyfs + (default: /unifyfs) + -M, --mpiio use MPI-IO instead of POSIX I/O + (default: off) + -n, --nblocks= count of blocks each process will read|write + (default: 32) + -N, --mapio use mmap instead of read|write + (default: off) + -o, --outfile= output file name (or path) + (default: 'stdout') + -p, --pattern= 'n1' (N-to-1 shared file) or 'nn' (N-to-N file per process) + (default: 'n1') + -P, --prdwr use pread|pwrite instead of read|write + (default: off) + -S, --stdio use fread|fwrite instead of read|write + (default: off) + -t, --pre-truncate= truncate file to size (B) before writing + (default: off) + -T, --post-truncate= truncate file to size (B) after writing + (default: off) + -U, --disable-unifyfs do not use UnifyFS + (default: enable UnifyFS) + -v, --verbose print verbose information + (default: off) + -V, --vecio use readv|writev instead of read|write + (default: off) + -x, --shuffle read different data than written + (default: off) One form of running this example could be: diff --git a/docs/images/UnifyFS-developers-documentation.png b/docs/images/UnifyFS-developers-documentation.png new file mode 100644 index 000000000..557a4c4ab Binary files /dev/null and b/docs/images/UnifyFS-developers-documentation.png differ diff --git a/docs/images/UnifyFS-logo.png b/docs/images/UnifyFS-logo.png new file mode 100644 index 000000000..fa374b5a0 Binary files /dev/null and b/docs/images/UnifyFS-logo.png differ diff --git a/docs/images/UnifyFS-logo_sml.png b/docs/images/UnifyFS-logo_sml.png new file mode 100644 index 000000000..acd6b2a4d Binary files /dev/null and b/docs/images/UnifyFS-logo_sml.png differ diff --git a/docs/images/design-high-lvl.png b/docs/images/design-high-lvl.png index 
897fd9f6c..a3f23dbfa 100644 Binary files a/docs/images/design-high-lvl.png and b/docs/images/design-high-lvl.png differ diff --git a/docs/images/unify-logo.png b/docs/images/unify-logo.png deleted file mode 100644 index c6a1a6049..000000000 Binary files a/docs/images/unify-logo.png and /dev/null differ diff --git a/docs/overview.rst b/docs/overview.rst index df735beb0..b98f035ff 100644 --- a/docs/overview.rst +++ b/docs/overview.rst @@ -11,6 +11,17 @@ easily as they do the parallel file system. This section will provide a high level design of UnifyFS. It will describe the UnifyFS library and the UnifyFS daemon. +The file system that UnifyFS instantiates only exists in user space and is +only visible to applications linked against the UnifyFS client library. Since +traditional file system tools (ls, cd, etc.) are not linked against the +UnifyFS client library they cannot see files within UnifyFS nor can they be +used to manipulate files within UnifyFS. Each UnifyFS file system lasts as +long as the server processes are running, which is typically as long as the +job they are running within. When the servers exit the file system is +deleted. It is therefore the user's responsibility to copy out files that +need to be persisted to another permanent file system. We provide an API and +a utility to make this easier. + --------------------------- High Level Design --------------------------- diff --git a/docs/slides/UnifyFS-developers-documentation.pdf b/docs/slides/UnifyFS-developers-documentation.pdf new file mode 100644 index 000000000..652af5fc2 Binary files /dev/null and b/docs/slides/UnifyFS-developers-documentation.pdf differ diff --git a/docs/start-stop.rst b/docs/start-stop.rst index e7e5947e3..ac9e894a9 100644 --- a/docs/start-stop.rst +++ b/docs/start-stop.rst @@ -40,17 +40,17 @@ for further details on customizing the UnifyFS runtime configuration. .. code-block:: Bash :linenos: - #!/bin/bash + #!/bin/bash - # spillover checkpoint data to node-local ssd storage - export UNIFYFS_SPILLOVER_DATA_DIR=/mnt/ssd/$USER/data - export UNIFYFS_SPILLOVER_META_DIR=/mnt/ssd/$USER/meta + # spillover data to node-local ssd storage + export UNIFYFS_LOGIO_SPILL_DIR=/mnt/ssd/$USER/data - # store server logs in job-specific scratch area - export UNIFYFS_LOG_DIR=$JOBSCRATCH/logs + # store server logs in job-specific scratch area + export UNIFYFS_LOG_DIR=$JOBSCRATCH/logs - unifyfs start --share-dir=/path/to/shared/file/system & + unifyfs start --share-dir=/path/to/shared/file/system +.. _unifyfs_utility_label: ``unifyfs`` provides command-line options to choose the client mountpoint, adjust the consistency model, and control stage-in and stage-out of files. @@ -59,30 +59,31 @@ The full usage for ``unifyfs`` is as follows: .. code-block:: Bash :linenos: - [prompt]$ unifyfs --help + [prompt]$ unifyfs --help - Usage: unifyfs [options...] + Usage: unifyfs [options...] 
- should be one of the following: - start start the UnifyFS server daemons - terminate terminate the UnifyFS server daemons + should be one of the following: + start start the UnifyFS server daemons + terminate terminate the UnifyFS server daemons - Common options: - -d, --debug enable debug output - -h, --help print usage + Common options: + -d, --debug enable debug output + -h, --help print usage - Command options for "start": - -c, --cleanup [OPTIONAL] clean up the UnifyFS storage upon server exit - -C, --consistency= [OPTIONAL] consistency model (NONE | LAMINATED | POSIX) - -e, --exe= [OPTIONAL] where unifyfsd is installed - -m, --mount= [OPTIONAL] mount UnifyFS at - -s, --script= [OPTIONAL] to custom launch script - -S, --share-dir= [REQUIRED] shared file system for use by servers - -i, --stage-in= [OPTIONAL] stage in file(s) at - -o, --stage-out= [OPTIONAL] stage out file(s) to on termination + Command options for "start": + -C, --consistency= [OPTIONAL] consistency model (NONE | LAMINATED | POSIX) + -e, --exe= [OPTIONAL] where unifyfsd is installed + -m, --mount= [OPTIONAL] mount UnifyFS at + -s, --script= [OPTIONAL] to custom launch script + -t, --timeout= [OPTIONAL] wait until all servers become ready + -S, --share-dir= [REQUIRED] shared file system for use by servers + -c, --cleanup [OPTIONAL] clean up the UnifyFS storage upon server exit + -i, --stage-in= [OPTIONAL] stage in manifest file(s) at - Command options for "terminate": - -s, --script= to custom termination script + Command options for "terminate": + -s, --script= [OPTIONAL] to custom termination script + -o, --stage-out= [OPTIONAL] stage out manifest file(s) at After UnifyFS servers have been successfully started, you may run your @@ -100,3 +101,45 @@ use ``unifyfs terminate`` to terminate the servers. Typically, one would pass the ``--cleanup`` option to ``unifyfs start`` to have the servers remove temporary data locally stored on each node after termination. +------------------------------------ + Resource Manager Job Integration +------------------------------------ + +UnifyFS includes optional support for integrating directly with compatible +resource managers to automatically start and stop servers at the beginning +and end of a job when requested by users. Resource manager integration +requires administrator privileges to deploy. + +Currently, only IBM's Platform LSF with Cluster System Manager (LSF-CSM) +is supported. LSF-CSM is the resource manager on the CORAL2 IBM systems +at ORNL and LLNL. The required job prologue and epilogue scripts, along +with a README documenting the installation instructions, is available +within the source repository at ``util/scripts/lsfcsm``. + +Support for the SLURM resource manager is under development. + +----------------------------------------------- + Stage-in and Stage-out Manifest File Format +----------------------------------------------- + +The manifest file contains one or more file copy requests. +Each line in the manifest corresponds to one file copy request, +and contains both the source and destination file paths. Currently, +directory copies are not supported. + +Each line is formatted as . +If either of the filenames +contain whitespace or special characters, then both filenames should +be surrounded by double-quote characters (") (ASCII character 34 decimal). +The double-quote character and the linefeed end-of-line character are forbidden +in any filenames used in a unifyfs manifest file, but any other +characters are allowed, including control characters. 
+If a filename contains any characters that might be misinterpreted, then +enclosing the filename in double-quotes is always +a safe thing to do. + +Here is an example of a valid stage-in manifest file: + +``/scratch/users/me/input_data/input_1.dat /unifyfs/input/input_1.dat`` +``/home/users/me/configuration/run_12345.conf /unifyfs/config/run_12345.conf`` +``"/home/users/me/file with space.dat" "/unifyfs/file with space.dat"`` diff --git a/docs/testing.rst b/docs/testing.rst index 5622a8c1a..c427e1ad2 100644 --- a/docs/testing.rst +++ b/docs/testing.rst @@ -55,6 +55,24 @@ Here is an example of a sharness test: test "P" == "NP" ' + # Various tests available to use inside test_expect_success/failure + test_expect_success "Show various available tests" ' + test_path_is_dir /somedir + test_must_fail test_dir_is_empty /somedir + test_path_is_file /somedir/somefile + ' + + # Use test_set_prereq/test_have_prereq to conditionally skip tests + [[ -n $(which h5cc 2>/dev/null) ]] && test_set_prereq HAVE_HDF5 + if test_have_prereq HAVE_HDF5; then + # run HDF5 tests + fi + + # Can also check for prereq in individual test + test_expect_success HAVE_HDF5 "Run HDF5 test" ' + # Run HDF5 test + ' + test_done .. _C-tests-label: @@ -62,8 +80,9 @@ Here is an example of a sharness test: C Program Tests ^^^^^^^^^^^^^^^ -C programs use the `libtap library`_ to implement test cases. Convenience -functions common to test cases written in C are implemented in the library +C programs use the `libtap library`_ to implement test cases. All available +testing functions are viewable in the `libtap README`_. Convenience functions +common to test cases written in C are implemented in the library `lib/testutil.c`_. If your C program needs to use environment variables set by sharness, it can be wrapped in a shell script that first sources `sharness.d/00-test-env.sh`_ and `sharness.d/01-unifyfs-settings.sh`_. Your @@ -74,12 +93,13 @@ The most common way to implement a test with libtap is to use the ``ok()`` function. TODO test cases that demonstrate known breakage are surrounded by the libtap library calls ``todo()`` and ``end_todo()``. -Here is an example libtap test: +Here are some examples of libtap tests: .. 
code-block:: C :linenos: #include "t/lib/tap.h" + #include "t/lib/testutil.h" #include int main(int argc, char *argv[]) @@ -89,16 +109,88 @@ Here is an example libtap test: result = (1 == 1); ok(result, "1 equals 1: %d", result); + /* Or put a function call directly in test */ + ok(somefunc() == 42, "somefunc() returns 42"); + ok(somefunc() == -1, "somefunc() should fail"); + + /* Use pass/fail for more complex code paths */ + int x = somefunc(); + if (x > 0) { + pass("somefunc() returned a valid value"); + } else { + fail("somefunc() returned an invalid value"); + } + + /* Use is/isnt for string comparisions */ + char buf[64] = {0}; + ok(fread(buf, 12, 1, fd) == 1, "read 12 bytes into buf); + is(buf, "hello world", "buf is \"hello world\""); + + /* Use cmp_mem to test first n bytes of memory */ + char* a = "foo"; + char* b = "bar"; + cmp_mem(a, b, 3); + + /* Use like/unlike to string match to a POSIX regex */ + like("stranger", "^s.(r).*\\1$", "matches the regex"); + + /* Use dies_ok/lives_ok to test whether code causes an exit */ + dies_ok({int x = 0/0;}, "divide by zero crashes"); + + /* Use todo for failing tests to be notified when they start passing */ todo("Prove this someday"); result = strcmp("P", "NP"); ok(result == 0, "P equals NP: %d", result); end_todo; - done_testing(); + /* Use skip/end_skip when a feature isn't implemented yet, or to + conditionally skip when a resource isn't available */ + skip(TRUE, 2, "Reason for skipping tests"); + ok(1); + ok(2); + end_skip; + + #ifdef HAVE_SOME_FEATURE + ok(somefunc()); + ok(someotherfunc()); + #else + skip(TRUE, 2, "Don't have SOME_FEATURE"); + end_skip; + #endif - return 0; + done_testing(); } +.. tip:: + + Including the file and line number, as well as any useful variable values, + in each test output can be very helpful when a test fails or needs to be + debugged. + + .. code-block:: C + + ok(somefunc() == 42, "%s:%d somefunc() returns 42", __FILE__, + __LINE__); + + Also note that ``errno`` is only set when an error occurs and is never set + back to ``0`` implicitly by the system. + When testing for a failure and using ``errno`` as part of the test, + setting ``errno = 0`` before the test will ensure a previous test error + will not affect the current test. In the following example, we also + assign ``errno`` to another variable ``err`` for use in constructing the + test message. This is needed because the ``ok()`` macro may use system + calls that set ``errno``. + + .. code-block:: C + + int err, rc; + errno = 0; + rc = systemcall(); + err = errno; + ok(rc == -1 && err == ENOTTY, + "%s:%d systemcall() should fail (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + ------------ Adding Tests @@ -195,11 +287,33 @@ in (i.e., testing a wrapper that doesn't have any tests yet): Running the Tests ***************** -To manually run the UnifyFS test suite, simply run ``make check`` from your -build/t directory. If changes are made to existing files in the test suite, the -tests can be run again by simply doing ``make clean`` followed by ``make -check``. Individual tests may be run by hand. The test ``0001-setup.t`` should -normally be run first to start the UnifyFS daemon. +To manually run the UnifyFS unit test suite, simply run ``make check`` from the +inside the t/ directory of wherever you built UnifyFS. E.g., if you built in a +separate build/ directory, then do: + +.. 
+
+    $ cd build/t
+    $ make check
+
+If on a system where jobs are launched on a separate compute node, then use your
+system's local MPI job launch command to run the unit tests:
+
+.. code-block:: BASH
+
+    $ cd build/t
+    $ srun -N1 -n1 make check
+
+If changes are made to existing files in the test suite, the tests can be run
+again by simply doing ``make clean`` followed by another ``make check``.
+
+Individual tests may be run by hand. The test *0001-setup.t* should
+normally be run first to start the UnifyFS daemon. E.g., to run just the
+*0100-sysio-gotcha.t* tests, do:
+
+.. code-block:: BASH
+
+    $ make check TESTS='0001-setup.t 0100-sysio-gotcha.t 9010-stop-unifyfsd.t 9999-cleanup.t'
 
 .. note::
 
@@ -312,217 +426,268 @@ documentation.
 
 ------------
 
-Running the Tests
-*****************
+Configuration Variables
+***********************
 
-.. attention::
+Along with the already provided :doc:`configuration` options/environment
+variables, there are environment variables used by the integration testing
+suite that can also be set in order to change the default behavior.
 
-    UnifyFS's integration test suite requires MPI and currently only supports
-    ``srun`` and ``jsrun`` MPI launch commands. Changes are coming to support
-    ``mpirun``.
+Key Variables
+^^^^^^^^^^^^^
 
-UnifyFS's integration tests are primarly set up to be run all as one suite.
-However, they can be run individually if desired.
+These environment variables can be set prior to sourcing the *t/ci/001-setup.sh*
+script and will affect how the overall integration suite operates.
 
-The testing scripts in `t/ci`_ depend on sharness_, which is set up in the
-containing *t/* directory. These tests will not function properly if moved or if
-they cannot find the sharness files.
+``UNIFYFS_INSTALL``
+"""""""""""""""""""
 
-.. important::
+USAGE: ``UNIFYFS_INSTALL=/path/to/dir/containing/UnifyFS/bin/directory``
 
-    Whether running all tests or individual tests, first make sure you have
-    either interactively allocated nodes or are submitting a batch job to run
-    them.
+The full path to the directory containing the *bin/* and *libexec/* directories
+for your UnifyFS installation. Set this envar to prevent the integration tests
+from searching for a UnifyFS installation automatically. Where the automatic
+search starts can be altered by setting the ``$BASE_SEARCH_DIR`` variable.
 
-    Also make sure all :ref:`dependencies ` are installed and
-    loaded.
+``UNIFYFS_CI_NPROCS``
+"""""""""""""""""""""
 
-By default, the integration tests will use the number of processes-per-node as
-there are nodes allocated for the job (i.e., if 4 nodes were allocated, then 4
-processes will be run per node). This can be changed by setting the
-:ref:`$CI_NPROCS ` environment variable.
+USAGE: ``UNIFYFS_CI_NPROCS=``
 
-.. note::
+The number of processes to use per node inside a job allocation. This defaults
+to 1 process per node. This can be adjusted if more processes are desired
+on multiple nodes or multiple processes are desired on a single node.
 
-    In order to run the the integration tests from a Spack_ installation of
-    UnifyFS, you'll need to tell Spack to use a different location for staging
-    builds in order to have the source files available from inside an allocation.
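+
+For instance, a minimal sketch of setting these key variables before launching
+the suite (assuming nodes are already allocated and using a hypothetical
+install path of *$HOME/UnifyFS/install*):
+
+.. code-block:: BASH
+
+    # point the tests at an existing install instead of auto-searching
+    $ export UNIFYFS_INSTALL=$HOME/UnifyFS/install
+
+    # use 2 processes per allocated node instead of the default 1
+    $ export UNIFYFS_CI_NPROCS=2
+
+    $ cd t/ci && ./RUN_CI_TESTS.sh
+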
+``UNIFYFS_CI_TEMP_DIR``
+"""""""""""""""""""""""
 
-    Open your Spack config file
+USAGE: ``UNIFYFS_CI_TEMP_DIR=/path/for/temporary/files/created/by/UnifyFS``
 
-    ``spack config edit config``
+Can be used as a shortcut to set ``UNIFYFS_RUNSTATE_DIR`` and
+``UNIFYFS_META_DB_PATH`` to the same path. This envar defaults to
+``UNIFYFS_CI_TEMP_DIR=${TMPDIR}/unifyfs.${USER}.${JOB_ID}``.
 
-    and provide a path that is visible during job allocations:
+``UNIFYFS_CI_LOG_CLEANUP``
+""""""""""""""""""""""""""
 
-    .. code-block:: yaml
+USAGE: ``UNIFYFS_CI_LOG_CLEANUP=yes|YES|no|NO``
 
-        config:
-          build_stage:
-            - /visible/path/from/all/allocated/nodes
-            # or build directly inside Spack's install directory
-            - $spack/var/spack/stage
+In the event ``$UNIFYFS_LOG_DIR`` has **not** been set, the logs will be put in
+``$SHARNESS_TRASH_DIRECTORY``, as set up by sharness.sh_, and cleaned up
+automatically after the tests have run. The logs will be in a
+*_/* subdirectory. Should any tests fail, sharness does not
+clean up the trash directory for debugging purposes. Setting
+``UNIFYFS_CI_LOG_CLEANUP=no|NO`` will move the *_/* logs
+directory to ``$UNIFYFS_CI_DIR`` (the directory containing the integration
+testing scripts) to allow them to persist even when all tests pass. This envar
+defaults to ``yes``.
 
-    Then make sure to include the ``--keep-stage`` option when installing:
 
 .. note::
 
-    ``spack install --keep-stage unifyfs``
+    Setting ``$UNIFYFS_LOG_DIR`` will put all created logs in the designated path
+    and will not clean them up.
 
-Running All Tests
-^^^^^^^^^^^^^^^^^
+``UNIFYFS_CI_HOST_CLEANUP``
+"""""""""""""""""""""""""""
 
-To run all of the tests, simply run ``./RUN_CI_TESTS.sh``.
+USAGE: ``UNIFYFS_CI_HOST_CLEANUP=yes|YES|no|NO``
 
-.. code-block:: BASH
+After all tests have run, the nodes on which the tests were run will
+automatically be cleaned up. This cleanup includes ensuring ``unifyfsd`` has
+stopped and deleting any files created by UnifyFS or its dependencies. Set
+``UNIFYFS_CI_HOST_CLEANUP=no|NO`` to skip cleaning up. This envar defaults to
+``yes``.
 
-    $ ./RUN_CI_TESTS.sh
 
 .. note::
 
-or
+    PDSH_ is required for cleanup and cleaning up is simply skipped if not
+    found.
 
-.. code-block:: BASH
+``UNIFYFS_CI_CLEANUP``
+""""""""""""""""""""""
 
-    $ prove -v RUN_CI_TESTS.sh
+USAGE: ``UNIFYFS_CI_CLEANUP=yes|YES|no|NO``
 
-Running Individual Tests
-^^^^^^^^^^^^^^^^^^^^^^^^
+Setting this to ``no|NO`` sets both ``$UNIFYFS_CI_LOG_CLEANUP`` and
+``$UNIFYFS_CI_HOST_CLEANUP`` to ``no|NO``.
 
-In order to run individual tests, testing functions and variables need to be set
-up first, and the UnifyFS server needs to be started. To do this, first source
-the *t/ci/001-setup.sh* script followed by *002-start-server.sh*. Then source
-each desired test script after that preceded by ``$CI_DIR/``. When finished,
-source the *990-stop-server.sh* script last to stop the server and clean up.
+``UNIFYFS_CI_TEST_POSIX``
+"""""""""""""""""""""""""
 
-.. code-block:: BASH
+USAGE: ``UNIFYFS_CI_TEST_POSIX=yes|YES|no|NO``
 
-    $ . full/path/to/001-setup.sh
-    $ . $CI_DIR/002-start-server.sh
-    $ . $CI_DIR/100-writeread-tests.sh
-    $ . $CI_DIR/990-stop-server.sh
+Determines whether any ``-posix`` tests should be run since they
+require a real mountpoint to exist.
 
-Configuration Variables
-^^^^^^^^^^^^^^^^^^^^^^^
+This envar defaults to ``yes``. However, when ``$UNIFYFS_MOUNTPOINT`` is set to a
+real directory, this envar is switched to ``no``. The idea behind this is that
+the tests can be run a first time with a fake mountpoint (which will also run
+the posix tests), and then the tests can be run again with a real mountpoint and
+the posix tests won't be run twice. This behavior can be overridden by setting
+``UNIFYFS_CI_TEST_POSIX=yes|YES`` before running the integration tests when
+``$UNIFYFS_MOUNTPOINT`` is set to an existing directory.
 
-Along with the already provided :doc:`configuration` options/environment
-variables, there are available environment variables used by the integration
-testing suite that can be set in order to change the default behavior. They are
-listed below in the order they are set up.
+An example of testing a posix example can be seen :ref:`below `.
 
-``CI_PROJDIR``
-""""""""""""""
+.. note::
 
-USAGE: ``CI_PROJDIR=/base/location/to/search/for/UnifyFS/source/files``
+    The posix mountpoint envar, ``UNIFYFS_CI_POSIX_MP``, is set to be located
+    inside ``$SHARNESS_TRASH_DIRECTORY`` automatically and cleaned up
+    afterwards. However, this envar can be set before running the integration
+    tests as well. If setting this, ensure that it is a shared file system that
+    all allocated nodes can see.
 
-During setup, the integration tests will search for the ``unifyfsd`` executable
-and installed example scripts if the UnifyFS install directory is not provided by
-the user with the ``UNIFYFS_INSTALL`` envar. ``CI_PROJDIR`` is the base location
-where this search will start and defaults to ``CI_PROJDIR=$HOME``.
+Additional Variables
+^^^^^^^^^^^^^^^^^^^^
 
+After sourcing the *t/ci/001-setup.sh* script there will be additional variables
+available that may be useful when writing/adding additional tests.
 
-``UNIFYFS_INSTALL``
+Directory Structure
 """""""""""""""""""
 
-USAGE: ``UNIFYFS_INSTALL=/path/to/dir/containing/UnifyFS/bin/directory``
+File structure here assumes UnifyFS was cloned to ``$HOME``.
 
-The full path to the directory containing the *bin/* and *libexec/* directories
-for a UnifyFS installation. Set this envar to prevent the integration tests from
-searching for a UnifyFS install directory automatically.
+``UNIFYFS_CI_DIR``
+    Directory containing the CI testing scripts. *$HOME/UnifyFS/t/ci/*
+``SHARNESS_DIR``
+    Directory containing the base sharness scripts. *$HOME/UnifyFS/t/*
+``UNIFYFS_SOURCE_DIR``
+    Directory containing the UnifyFS source code. *$HOME/UnifyFS/*
+``BASE_SEARCH_DIR``
+    Parent directory containing the UnifyFS source code. Starting place to auto
+    search for UnifyFS install when ``$UNIFYFS_INSTALL`` isn't provided. *$HOME/*
 
-.. _ci-nprocs-label:
+Executable Locations
+""""""""""""""""""""
 
-``CI_NPROCS``
-"""""""""""""
+``UNIFYFS_BIN``
+    Directory containing ``unifyfs`` and ``unifyfsd``. *$UNIFYFS_INSTALL/bin*
+``UNIFYFS_EXAMPLES``
+    Directory containing the compiled examples_. *$UNIFYFS_INSTALL/libexec*
 
-USAGE: ``CI_NPROCS=``
+Resource Managers
+"""""""""""""""""
 
-The number of processes to use per node inside a job allocation. This defaults
-to the number of processes per node as there are nodes in the allocation (i.e.,
-if 4 nodes were allocated, then 4 processes will be run per node). This should
-be adjusted if fewer processes are desired on multiple nodes, multiple processes
-are desired on a single node, or a large number of nodes have been allocated.
+``JOB_RUN_COMMAND``
+    The base MPI job launch command established according to the detected
+    resource manager, number of allocated nodes, and ``$UNIFYFS_CI_NPROCS``.
+
+    The LSF variables below will also affect the default version of this command
+    when using that resource manager.
+``JOB_RUN_ONCE_PER_NODE``
+    MPI job launch command to only run a single process on each allocated node
+    established according to the detected resource manager.
+``JOB_ID``
+    The ID assigned to the current CI job as established by the detected
+    resource manager.
+
+LSF
+"""
+
+Additional variables used by the LSF resource manager to determine how jobs are
+launched with ``$JOB_RUN_COMMAND``. These can also be set prior to sourcing the
+*t/ci/001-setup.sh* script and will affect how the integration tests run.
+
+``UNIFYFS_CI_NCORES``
+    Number of cores-per-resource-set to use. Defaults to 20.
+``UNIFYFS_CI_NRS_PER_NODE``
+    Number of resource-sets-per-node to use. Defaults to 1.
+``UNIFYFS_CI_NRES_SETS``
+    Total number of resource sets to use. Defaults to (number_of_nodes) *
+    (``$UNIFYFS_CI_NRS_PER_NODE``).
+
+Misc
+""""
+
+``KB``
+    :math:`2^{10}`.
+``MB``
+    :math:`2^{20}`.
+``GB``
+    :math:`2^{30}`.
 
-``CI_LOG_CLEANUP``
-""""""""""""""""""
+------------
 
-USAGE: ``CI_LOG_CLEANUP=yes|YES|no|NO``
+Running the Tests
+*****************
 
-In the event ``$UNIFYFS_LOG_DIR`` has **not** been set, the logs will be put in
-``$SHARNESS_TRASH_DIRECTORY``, as set up by sharness.sh_, and cleaned up
-automatically after the tests have run. The logs will be in a
-*_/* subdirectory. Should any tests fail, the trash
-directory will not be cleaned up for debugging purposes. Setting
-``CI_LOG_CLEANUP=no|NO`` will move the *_/* logs directory
-to ``$CI_DIR`` (the directory containing the integration tests) to
-allow them to persist even when all tests pass. This envar defauls to ``yes``.
+.. attention::
 
-.. note::
+    UnifyFS's integration test suite requires MPI and currently only supports
+    ``srun`` and ``jsrun`` MPI launch commands. Changes are coming to support
+    ``mpirun``.
 
-    Setting ``$UNIFYFS_LOG_DIR`` will put all created logs in the designated path
-    and will not clean them up.
+UnifyFS's integration tests are primarily set up to be run as one suite.
+However, they can be run individually if desired.
 
-``CI_HOST_CLEANUP``
-"""""""""""""""""""
+The testing scripts in `t/ci`_ depend on sharness_, which is set up in the
+containing *t/* directory. These tests will not function properly if moved or if
+they cannot find the sharness files.
 
-USAGE: ``CI_HOST_CLEANUP=yes|YES|no|NO``
+Whether running all tests or individual tests, first make sure you have
+either interactively allocated nodes or are submitting a batch job to run
+them.
 
-After all tests have run, the nodes on which the tests were ran will
-automatically be cleaned up. This cleanup includes ensuring ``unifyfsd`` has
-stopped and deleting any files created by UnifyFS or its dependencies. Set
-``CI_HOST_CLEANUP=no|NO`` to skip cleaning up. This envar defaults to ``yes``.
+Make sure all :ref:`dependencies ` are installed and loaded.
 
 .. note::
 
-    PDSH_ is required for cleanup and cleaning up is simply skipped if not
-    found.
+    In order to run the integration tests from a Spack_ installation of
+    UnifyFS, you'll need to tell Spack to use a different location for staging
+    builds in order to have the source files available from inside an allocation.
+
+    Open your Spack config file
 
-``CI_CLEANUP``
-""""""""""""""
+    ``spack config edit config``
 
-USAGE: ``CI_CLEANUP=yes|YES|no|NO``
+    and provide a path that is visible during job allocations:
 
-Setting this to ``no|NO`` sets both ``$CI_LOG_CLEANUP`` and ``$CI_HOST_CLEANUP``
-to ``no|NO``.
+    .. 
code-block:: yaml -``CI_TEMP_DIR`` -"""""""""""""""" + config: + build_stage: + - /visible/path/from/all/allocated/nodes + # or build directly inside Spack's install directory + - $spack/var/spack/stage -USAGE: ``CI_TEMP_DIR=/path/for/temporary/files/created/by/UnifyFS`` + Then make sure to include the ``--keep-stage`` option when installing: -Can be used as a shortcut to set ``UNIFYFS_RUNSTATE_DIR`` and -``UNIFYFS_META_DB_PATH`` to the same path. This envar defaults to -``CI_TEMP_DIR=${TMPDIR}/unifyfs.${USER}.${JOB_ID}``. + ``spack install --keep-stage unifyfs`` -``CI_STORAGE_DIR`` -""""""""""""""""""" +Running All Tests +^^^^^^^^^^^^^^^^^ -USAGE: ``CI_STORAGE_DIR=/path/for/storage/files/`` +To run all of the tests, simply run ``./RUN_CI_TESTS.sh``. -Can be used as a shortcut to set ``UNIFYFS_SPILLOVER_DATA_DIR`` and -``UNIFYFS_SPILLOVER_META_DIR`` to the same path. This envar defaults to -``CI_STORAGE_DIR=${TMPDIR}/unifyfs.${USER}.${JOB_ID}``. +.. code-block:: BASH -``CI_TEST_POSIX`` -""""""""""""""""" + $ ./RUN_CI_TESTS.sh -USAGE: ``CI_TEST_POSIX=yes|YES|no|NO`` +or -Determines whether any ``-posix`` tests should be run since they -require a real mountpoint to exist. +.. code-block:: BASH -This envar defaults to ``yes``. However, when ``$UNIFYFS_MOUNTPOINT`` is set to a -real directory, this envar is switched to ``no``. The idea behind this is that -the tests can be run a first time with a fake mountpoint (which will also run -the posix tests), and then the tests can be run again with a real mountpoint and -the posix tests wont be run twice. This behavior can be overridden by setting -``CI_TEST_POSIX=yes|YES`` before running the integration tests when -``$UNIFYFS_MOUNTPOINT`` is set to an existing directory. + $ prove -v RUN_CI_TESTS.sh -An example of testing a posix example can be see :ref:`below `. +Running Individual Tests +^^^^^^^^^^^^^^^^^^^^^^^^ -.. note:: +In order to run individual tests, the testing functions and variables need to be +set up first and then the UnifyFS server needs to be started. + +First source the *t/ci/001-setup.sh* script whereafter sharness will change +directories to the ``$SHARNESS_TRASH_DIRECTORY``. To account for this, source +*002-start-server.sh* and each desired test script after that prefixed with +``$UNIFYFS_CI_DIR/``. When finished, source the *990-stop-server.sh* script +last to stop the server and clean up. - The the posix mountpoint envar, ``CI_POSIX_MP``, is set up inside - ``$SHARNESS_TRASH_DIRECTORY`` automatically and cleaned up afterwards. - However, this envar can be set before running the integration tests as well. - If setting this, ensure that it is a shared file system that all allocated - nodes can see. +.. code-block:: BASH + + $ . ./001-setup.sh + $ . $UNIFYFS_CI_DIR/002-start-server.sh + $ . $UNIFYFS_CI_DIR/100-writeread-tests.sh + $ . $UNIFYFS_CI_DIR/990-stop-server.sh ------------ @@ -542,12 +707,12 @@ as simple as possible. One particularly useful function is ``unify_run_test()``. Currently, this function is set up to work for the *write*, *read*, *writeread*, and *checkpoint-restart* examples. This function sets up the MPI job run command and -default arguments as well as any default arguments wanted by all examples. See +default options as well as any default arguments wanted by all examples. See :ref:`below ` for details. .. 
_helper-label: -Example Helper Functions +Testing Helper Functions ^^^^^^^^^^^^^^^^^^^^^^^^ There are helper functions available in `t/ci/ci-functions.sh`_ that can make @@ -567,14 +732,15 @@ example with the appropriate MPI runner and args. This function is meant to make running the cr, write, read, and writeread examples as easy as possible. The ``build_test_command()`` function is called by this function which -automatically sets any options that are always wanted (-vkf as well as -U and +automatically sets any options that are always wanted (-vkfo as well as -U and the appropriate -m if posix test or not). The stderr output file is also created (based on the filename that is autogenerated) and the appropriate option is set for the MPI job run command. -Args that can be passed in are ([-pncbx][-A|-M|-P|-S|-V]). All other args (see -:ref:`Running the Examples `) are set automatically, including the -filename (which is generated based on the input ``$app_name`` and ``$app_args``). +Args that can be passed in are ([-pncbx][-A|-M|-N|-P|-S|-V]). All other args +(see :ref:`Running the Examples `) are set automatically, +including the outfile and filename (which are generated based on the input +``$app_name`` and ``$app_args``). The third parameter is an optional "pass-by-reference" parameter that can contain the variable name for the resulting output to be stored in, allowing @@ -626,18 +792,18 @@ The results can then be tested with sharness_: USAGE: ``get_filename app_name app_args [app_suffix]`` -Builds and returns the filename for an example so that if it shows up in the -``$UNIFYFS_MOUNTPOINT`` (when using an existing mountpoint), it can be tracked -to its originating test for debugging. Error files are created with this -filename and a ``.err`` suffix and placed in the logs directory for debugging. +Builds and returns the filename with the provided suffix based on the input +app_name and app_args. + +The filename in ``$UNIFYFS_MOUNTPOINT`` will be given a ``.app`` suffix. -Also allows testers to get what the filename will be in advance if called +This allows tests to get what the filename will be in advance if called from a test suite. This can be used for posix tests to ensure the file showed -up in the mount point, as well as for cp/stat tests that potentially need the -filename from a previous test. +up in the mount point, as well as for read, cp, stat tests that potentially need +the filename from a previous test prior to running. -Note that the filename created by ``unify_run_test()`` will have a ``.app`` -suffix. +Error logs and outfiles are also created with this filename, with a ``.err`` or +``.out`` suffix respectively, and placed in the logs directory. Returns a string with the spaces removed and hyphens replaced by underscores. @@ -649,7 +815,8 @@ Returns a string with the spaces removed and hyphens replaced by underscores. 
Some uses cases may be:
 
 - posix tests where the file existence is checked for after a test was run
-- cp/stat tests where an already existing filename from a prior test is needed
+- read, cp, or stat tests where an already existing filename from a prior test
+  might be needed
 
 For example:
 
@@ -672,9 +839,35 @@ For example:
     test_expect_success POSIX "$app_name $app_args: (line_count=$line_count, rc=$rc)" '
         test $rc = 0 &&
         test $line_count = 8 &&
-        test_path_has_file_per_process $CI_POSIX_MP $filename
+        test_path_has_file_per_process $UNIFYFS_CI_POSIX_MP $filename
     '
 
+Additional Functions
+""""""""""""""""""""
+
+There are other convenience functions found in `t/ci/ci-functions.sh`_ that
+may be helpful when writing/adding tests:
+
+``find_executable()``
+    USAGE: ``find_executable abs_path *file_name|*path/file_name [prune_path]``
+
+    Locate the desired executable file when provided an absolute path of where
+    to start searching, the name of the file with an optional preceding path,
+    and an optional prune_path, or path to omit from the search.
+
+    Returns the path of the first executable found with the given name and
+    optional prefix.
+``elapsed_time()``
+    USAGE: ``elapsed_time start_time_in_seconds end_time_in_seconds``
+
+    Calculates the elapsed time between two given times.
+
+    Returns the elapsed time formatted as HH:MM:SS.
+``format_bytes()``
+    USAGE: ``format_bytes int``
+
+    Returns the input bytes formatted as KB, MB, or GB (1024 becomes 1KB).
+
 Sharness Helper Functions
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -699,7 +892,6 @@ Expects two arguments:
 - $2 - Number of seconds to wait before giving up
 
 .. code-block:: BASH
-    :emphasize-lines:
 
     test_expect_success "unifyfsd is running" '
         process_is_running unifyfsd 5
@@ -797,6 +989,7 @@ comments in `t/ci/ci-functions.sh`_.
 .. _GitLab: https://about.gitlab.com
 .. _examples: https://github.com/LLNL/UnifyFS/tree/dev/examples/src
 .. _libtap library: https://github.com/zorgnax/libtap
+.. _libtap README: https://github.com/zorgnax/libtap/blob/master/README.md
 .. _lib/testutil.c: https://github.com/LLNL/UnifyFS/blob/dev/t/lib/testutil.c
 .. _PDSH: https://github.com/chaos/pdsh
 .. 
_sharness: https://github.com/chriscool/sharness diff --git a/examples/src/Makefile.am b/examples/src/Makefile.am index 6835decac..33c40e54e 100644 --- a/examples/src/Makefile.am +++ b/examples/src/Makefile.am @@ -1,21 +1,63 @@ libexec_PROGRAMS = \ - cr-posix cr-gotcha cr-static \ - read-posix read-gotcha read-static \ - write-posix write-gotcha write-static \ - writeread-posix writeread-gotcha writeread-static \ - sysio-write-gotcha sysio-write-static \ - sysio-read-gotcha sysio-read-static \ - sysio-writeread-gotcha sysio-writeread-static sysio-writeread-posix \ - sysio-writeread2-gotcha sysio-writeread2-static \ - sysio-dir-gotcha sysio-dir-static \ - sysio-stat-gotcha sysio-stat-static \ - sysio-cp-gotcha sysio-cp-static \ - app-mpiio-gotcha app-mpiio-static \ - app-btio-gotcha app-btio-static \ - app-tileio-gotcha app-tileio-static \ - transfer-gotcha transfer-static \ - size-gotcha size-static \ - chmod-gotcha chmod-static + cr-posix \ + read-posix \ + write-posix \ + writeread-posix \ + sysio-writeread-posix + +if HAVE_LD_WRAP + libexec_PROGRAMS += \ + cr-static \ + read-static \ + write-static \ + writeread-static \ + sysio-write-static \ + sysio-read-static \ + sysio-writeread-static \ + sysio-writeread2-static \ + sysio-dir-static \ + sysio-stat-static \ + sysio-cp-static \ + sysio-truncate-static \ + sysio-unlink-static \ + sysio-open-static \ + app-mpiio-static \ + app-btio-static \ + app-tileio-static \ + transfer-static \ + size-static \ + simul-static \ + chmod-static \ + multi-write-static \ + read-data-static +endif + +if HAVE_GOTCHA + libexec_PROGRAMS += \ + cr-gotcha \ + read-gotcha \ + write-gotcha \ + writeread-gotcha \ + sysio-write-gotcha \ + sysio-read-gotcha \ + sysio-writeread-gotcha \ + sysio-writeread2-gotcha \ + sysio-dir-gotcha \ + sysio-stat-gotcha \ + sysio-cp-gotcha \ + sysio-truncate-gotcha \ + sysio-unlink-gotcha \ + sysio-open-gotcha \ + app-mpiio-gotcha \ + app-btio-gotcha \ + app-tileio-gotcha \ + transfer-gotcha \ + size-gotcha \ + simul-gotcha \ + chmod-gotcha \ + multi-write-gotcha \ + read-data-gotcha +endif if HAVE_FORTRAN libexec_PROGRAMS += \ @@ -40,25 +82,26 @@ noinst_HEADERS = \ test_cppflags = $(AM_CPPFLAGS) $(MPI_CFLAGS) \ -I$(top_srcdir)/client/src -I$(top_srcdir)/common/src +if USE_PMPI_WRAPPERS +test_cppflags += -DENABLE_MPI_MOUNT +endif + if HAVE_FORTRAN test_ftn_flags = $(AM_FCFLAGS) $(MPI_FFLAGS) \ -I$(top_srcdir)/client/src -I$(top_srcdir)/common/src test_ftn_ldadd = $(top_builddir)/client/src/libunifyfsf.la -lrt -lm $(FCLIBS) -test_ftn_ldflags = $(AM_LDFLAGS) $(MPI_FLDFLAGS) \ - $(FLATCC_LDFLAGS) $(FLATCC_LIBS) +test_ftn_ldflags = $(AM_LDFLAGS) $(MPI_FLDFLAGS) endif test_gotcha_ldadd = $(top_builddir)/client/src/libunifyfs_gotcha.la -lrt -lm -test_gotcha_ldflags = $(AM_LDFLAGS) $(MPI_CLDFLAGS) \ - $(FLATCC_LDFLAGS) $(FLATCC_LIBS) +test_gotcha_ldflags = $(AM_LDFLAGS) $(MPI_CLDFLAGS) test_posix_cppflags = $(AM_CPPFLAGS) $(MPI_CFLAGS) -DDISABLE_UNIFYFS test_posix_ldadd = -lrt -lm test_posix_ldflags = $(AM_LDFLAGS) $(MPI_CLDFLAGS) test_static_ldadd = $(top_builddir)/client/src/libunifyfs.la -lrt -lm -test_static_ldflags = -static $(CP_WRAPPERS) $(AM_LDFLAGS) $(MPI_CLDFLAGS) \ - $(FLATCC_LDFLAGS) $(FLATCC_LIBS) +test_static_ldflags = -static $(CP_WRAPPERS) $(AM_LDFLAGS) $(MPI_CLDFLAGS) # Per-target flags begin here @@ -137,6 +180,36 @@ sysio_cp_static_CPPFLAGS = $(test_cppflags) sysio_cp_static_LDADD = $(test_static_ldadd) sysio_cp_static_LDFLAGS = $(test_static_ldflags) +sysio_truncate_gotcha_SOURCES = sysio-truncate.c 
+sysio_truncate_gotcha_CPPFLAGS = $(test_cppflags) +sysio_truncate_gotcha_LDADD = $(test_gotcha_ldadd) +sysio_truncate_gotcha_LDFLAGS = $(test_gotcha_ldflags) + +sysio_truncate_static_SOURCES = sysio-truncate.c +sysio_truncate_static_CPPFLAGS = $(test_cppflags) +sysio_truncate_static_LDADD = $(test_static_ldadd) +sysio_truncate_static_LDFLAGS = $(test_static_ldflags) + +sysio_unlink_gotcha_SOURCES = sysio-unlink.c +sysio_unlink_gotcha_CPPFLAGS = $(test_cppflags) +sysio_unlink_gotcha_LDADD = $(test_gotcha_ldadd) +sysio_unlink_gotcha_LDFLAGS = $(test_gotcha_ldflags) + +sysio_unlink_static_SOURCES = sysio-unlink.c +sysio_unlink_static_CPPFLAGS = $(test_cppflags) +sysio_unlink_static_LDADD = $(test_static_ldadd) +sysio_unlink_static_LDFLAGS = $(test_static_ldflags) + +sysio_open_gotcha_SOURCES = sysio-open.c +sysio_open_gotcha_CPPFLAGS = $(test_cppflags) +sysio_open_gotcha_LDADD = $(test_gotcha_ldadd) +sysio_open_gotcha_LDFLAGS = $(test_gotcha_ldflags) + +sysio_open_static_SOURCES = sysio-open.c +sysio_open_static_CPPFLAGS = $(test_cppflags) +sysio_open_static_LDADD = $(test_static_ldadd) +sysio_open_static_LDFLAGS = $(test_static_ldflags) + cr_posix_SOURCES = checkpoint-restart.c cr_posix_CPPFLAGS = $(test_posix_cppflags) cr_posix_LDADD = $(test_posix_ldadd) @@ -182,17 +255,17 @@ write_static_CPPFLAGS = $(test_cppflags) write_static_LDADD = $(test_static_ldadd) write_static_LDFLAGS = $(test_static_ldflags) -writeread_posix_SOURCES = writeread.c +writeread_posix_SOURCES = writeread.c testutil.c writeread_posix_CPPFLAGS = $(test_posix_cppflags) writeread_posix_LDADD = $(test_posix_ldadd) writeread_posix_LDFLAGS = $(test_posix_ldflags) -writeread_gotcha_SOURCES = writeread.c +writeread_gotcha_SOURCES = writeread.c testutil.c writeread_gotcha_CPPFLAGS = $(test_cppflags) writeread_gotcha_LDADD = $(test_gotcha_ldadd) writeread_gotcha_LDFLAGS = $(test_gotcha_ldflags) -writeread_static_SOURCES = writeread.c +writeread_static_SOURCES = writeread.c testutil.c writeread_static_CPPFLAGS = $(test_cppflags) writeread_static_LDADD = $(test_static_ldadd) writeread_static_LDFLAGS = $(test_static_ldflags) @@ -270,6 +343,16 @@ size_static_CPPFLAGS = $(test_cppflags) size_static_LDADD = $(test_static_ldadd) size_static_LDFLAGS = $(test_static_ldflags) +simul_gotcha_SOURCES = simul.c +simul_gotcha_CPPFLAGS = $(test_cppflags) +simul_gotcha_LDADD = $(test_gotcha_ldadd) +simul_gotcha_LDFLAGS = $(test_gotcha_ldflags) + +simul_static_SOURCES = simul.c +simul_static_CPPFLAGS = $(test_cppflags) +simul_static_LDADD = $(test_static_ldadd) +simul_static_LDFLAGS = $(test_static_ldflags) + chmod_gotcha_SOURCES = chmod.c testutil.c chmod_gotcha_CPPFLAGS = $(test_cppflags) chmod_gotcha_LDADD = $(test_gotcha_ldadd) @@ -280,3 +363,22 @@ chmod_static_CPPFLAGS = $(test_cppflags) chmod_static_LDADD = $(test_static_ldadd) chmod_static_LDFLAGS = $(test_static_ldflags) +multi_write_gotcha_SOURCES = multi-write.c testutil.c +multi_write_gotcha_CPPFLAGS = $(test_cppflags) +multi_write_gotcha_LDADD = $(test_gotcha_ldadd) +multi_write_gotcha_LDFLAGS = $(test_gotcha_ldflags) + +multi_write_static_SOURCES = multi-write.c testutil.c +multi_write_static_CPPFLAGS = $(test_cppflags) +multi_write_static_LDADD = $(test_static_ldadd) +multi_write_static_LDFLAGS = $(test_static_ldflags) + +read_data_gotcha_SOURCES = read-data.c +read_data_gotcha_CPPFLAGS = $(test_cppflags) +read_data_gotcha_LDADD = $(test_gotcha_ldadd) +read_data_gotcha_LDFLAGS = $(test_gotcha_ldflags) + +read_data_static_SOURCES = read-data.c +read_data_static_CPPFLAGS = 
$(test_cppflags) +read_data_static_LDADD = $(test_static_ldadd) +read_data_static_LDFLAGS = $(test_static_ldflags) diff --git a/examples/src/app-btio.c b/examples/src/app-btio.c index 57a3a39da..a238f38da 100644 --- a/examples/src/app-btio.c +++ b/examples/src/app-btio.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. diff --git a/examples/src/app-hdf5-create.c b/examples/src/app-hdf5-create.c index bf6d41333..86817b998 100644 --- a/examples/src/app-hdf5-create.c +++ b/examples/src/app-hdf5-create.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -11,6 +11,7 @@ * For details, see https://github.com/LLNL/UnifyFS. * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. */ + /* * Copyright by The HDF Group. * Copyright by the Board of Trustees of the University of Illinois. @@ -162,18 +163,18 @@ int main(int argc, char** argv) /* Create a new file using default properties. */ file_id = H5Fcreate(targetfile, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT); - printf("H5Fcreate: %d\n", file_id); + printf("H5Fcreate: %ld\n", (long) file_id); /* Create the data space for the dataset. */ dims[0] = 4; dims[1] = 6; dataspace_id = H5Screate_simple(2, dims, NULL); - printf("H5Screate_simple: %d\n", dataspace_id); + printf("H5Screate_simple: %ld\n", (long) dataspace_id); /* Create the dataset. */ dataset_id = H5Dcreate2(file_id, "/dset", H5T_STD_I32BE, dataspace_id, H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); - printf("H5Dcreate2: %d\n", dataset_id); + printf("H5Dcreate2: %ld\n", (long) dataset_id); if (write_data) { int i, j; diff --git a/examples/src/app-hdf5-writeread.c b/examples/src/app-hdf5-writeread.c index 405888896..f2c371bd8 100644 --- a/examples/src/app-hdf5-writeread.c +++ b/examples/src/app-hdf5-writeread.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -11,6 +11,7 @@ * For details, see https://github.com/LLNL/UnifyFS. * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. */ + /* * Copyright by The HDF Group. * Copyright by the Board of Trustees of the University of Illinois. @@ -186,11 +187,11 @@ int main(int argc, char** argv) /* Open an existing file. */ file_id = H5Fopen(targetfile, H5F_ACC_RDWR, H5P_DEFAULT); - printf("H5Fopen: %d\n", file_id); + printf("H5Fopen: %ld\n", (long) file_id); /* Open an existing dataset. */ dataset_id = H5Dopen2(file_id, "/dset", H5P_DEFAULT); - printf("H5open2: %d\n", dataset_id); + printf("H5open2: %ld\n", (long) dataset_id); if (!readonly) { /* Write the dataset. 
*/ diff --git a/examples/src/app-mpiio.c b/examples/src/app-mpiio.c index 33efbd8c1..8444134f9 100644 --- a/examples/src/app-mpiio.c +++ b/examples/src/app-mpiio.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -11,6 +11,7 @@ * For details, see https://github.com/LLNL/UnifyFS. * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. */ + #include #include diff --git a/examples/src/app-tileio.c b/examples/src/app-tileio.c index 681624be1..53640c140 100644 --- a/examples/src/app-tileio.c +++ b/examples/src/app-tileio.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. diff --git a/examples/src/checkpoint-restart.c b/examples/src/checkpoint-restart.c index 101d89c7f..7099144fd 100644 --- a/examples/src/checkpoint-restart.c +++ b/examples/src/checkpoint-restart.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2019, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2019, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -177,6 +177,8 @@ int verify_restart_data(test_cfg* cfg, char* data, uint64_t last_ckpt_id) * cfg.use_mapio - support is not yet implemented. When enabled, * direct memory loads and stores will be used for reads and writes. * + * cfg.use_mpiio - when enabled, MPI-IO will be used. + * * cfg.use_prdwr - when enabled, pread(2) and pwrite(2) will be used. * * cfg.use_stdio - when enabled, fread(2) and fwrite(2) will be used. diff --git a/examples/src/chmod.c b/examples/src/chmod.c index af6ad85d4..e8d0dd7c6 100644 --- a/examples/src/chmod.c +++ b/examples/src/chmod.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2019, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2019, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -10,7 +10,9 @@ * This is the license for UnifyFS. * For details, see https://github.com/LLNL/UnifyFS. * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. - * + */ + +/* * Test chmod() and fchmod() * * Test description: @@ -23,6 +25,7 @@ * 7. Laminate it using fchmod() * 8. Check file is laminated after */ + #include "testutil.h" int do_test(test_cfg* cfg) @@ -32,7 +35,9 @@ int do_test(test_cfg* cfg) int fd; file = mktemp_cmd(cfg, "/unifyfs"); - assert(file); + if (NULL == file) { + return ENOMEM; + } test_print(cfg, "Create empty file %s", file); test_print(cfg, "Before lamination stat() is:"); @@ -87,9 +92,14 @@ int main(int argc, char* argv[]) fflush(NULL); return rc; } - do_test(cfg); + + rc = do_test(cfg); + if (rc) { + test_print(cfg, "ERROR - Test %s failed! 
rc=%d", argv[0], rc); + fflush(NULL); + } test_fini(cfg); - return 0; + return rc; } diff --git a/examples/src/multi-write.c b/examples/src/multi-write.c new file mode 100644 index 000000000..369f82499 --- /dev/null +++ b/examples/src/multi-write.c @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +/* + * Test doing lots of writes to many open files and verify the data is written + * correctly. This can be used to exercise bugs. + * + * Test description: + * 1. Fill bigbuf[] with repeating A-Z + * 2. Do a bunch of writes with random offsets and lengths to multiple files, + * using bigbuf[] as the data. + * 3. Laminate the files. + * 4. Read them back, and verify the portions that did get written match the + * data from bigbuf[]. + */ + +#include +#include +#include +#include +#include +#include + +#include "testutil.h" + +#define NUM_FILES 10 +#define NUM_WRITES 100 + +/* This is large enough write size to periodically cross 1MB slice boundaries */ +#define MAX_WRITE (1024*1024) +#define SEED 1 + +char bigbuf[1024*1024*10]; +char tmpbuf[1024*1024*10]; + +void fill_bigbuf(void) +{ + char r; + int i; + + /* Fill bigbuf[] repeating A-Z chars */ + for (i = 0; i < sizeof(bigbuf); i++) { + bigbuf[i] = 'A'+ (i % 26); + } +} + +/* Compare a file with the data in bigbuf[] */ +int check_file(char* file) +{ + int fd; + int rc; + int matched = 0; + fd = open(file, O_RDONLY, 0222); + + memset(tmpbuf, 0, sizeof(tmpbuf)); + rc = read(fd, tmpbuf, sizeof(tmpbuf)); + printf("%s: read %d bytes\n", file, rc); + + for (int i = 0; i < rc; i++) { + if (tmpbuf[i] == bigbuf[i]) { + matched++; + } + + if (tmpbuf[i] != bigbuf[i] && tmpbuf[i] != 0) { + printf("%s failed at offset %d (tmpbuf['%c'] != bigbuf['%c'])\n", + file, i, tmpbuf[i], bigbuf[i]); + printf("Comparing last 10 bytes before/after:\n"); + printf("expected: "); + for (int j = i - 10; j < i; j++) { + printf("%c", bigbuf[j] ? bigbuf[j] : ' '); + } + + printf("|%c|", bigbuf[i]); + + for (int j = i + 1; j < i + 11; j++) { + printf("%c", bigbuf[j] ? bigbuf[j] : ' '); + } + printf("\n"); + + printf("got: "); + + for (int j = i - 10; j < i; j++) { + printf("%c", tmpbuf[j] ? tmpbuf[j] : ' '); + } + + printf("|%c|", tmpbuf[i]); + + for (int j = i + 1; j < i + 11; j++) { + printf("%c", tmpbuf[j] ? 
tmpbuf[j] : ' '); + } + + printf("\n"); + + + return 1; + } + } + if (rc > 0 && matched == 0) { + printf("%s: No matches with file %s\n", __func__, file); + return 1; + } + return 0; +} + +int do_test(test_cfg* cfg) +{ + int rc; + int fds[NUM_FILES], fd; + char* file[NUM_FILES]; + char buf[40] = {0}; + int i; + int rnd; + int start, count; + fill_bigbuf(); + srand(SEED); + + /* Create our files */ + for (i = 0; i < NUM_FILES; i++) { + file[i] = mktemp_cmd(cfg, "/unifyfs"); + fds[i] = open(file[i], O_WRONLY | O_CREAT, 0222); + } + + /* Write our files */ + for (i = 0; i < NUM_WRITES; i++) { + /* Randomly pick one of our files to write to */ + rnd = rand() % NUM_FILES; + fd = fds[rnd]; + + /* Pick a random offset and count */ + start = rand() % (sizeof(bigbuf) - MAX_WRITE); + + /* + 1 so we always write at least 1 byte */ + count = (rand() % (MAX_WRITE-1)) + 1; + lseek(fd, start, SEEK_SET); + if (write(fd, &bigbuf[start], count) != count) { + perror("Couldn't write"); + exit(1); + } + } + + /* Sync extents of all our files and laminate them */ + for (i = 0; i < NUM_FILES; i++) { + rc = fsync(fds[i]); + if (rc != 0) { + printf("%s %d/%d failed to sync, rc = %d, (errno %d %s)\n", + file[i], i+1, NUM_FILES, rc, errno, strerror(errno)); + exit(1); + } + close(fds[i]); + + rc = chmod(file[i], 0444); + if (rc != 0) { + printf("%s failed to chmod, rc = %d\n", file[i], rc); + exit(1); + } + + } + + /* Verify the writes to the files match the values in bigbuf[] */ + for (i = 0; i < NUM_FILES; i++) { + if (check_file(file[i]) != 0) { + printf("file %d/%d failed\n", i+1, NUM_FILES); + exit(1); /* Error */ + } + + free(file[i]); + } + printf("Passed!\n"); +} + +int main(int argc, char* argv[]) +{ + test_cfg test_config; + test_cfg* cfg = &test_config; + int rc; + + rc = test_init(argc, argv, cfg); + if (rc) { + test_print(cfg, "ERROR - Test %s initialization failed!", argv[0]); + fflush(NULL); + return rc; + } + do_test(cfg); + + test_fini(cfg); + + return 0; +} diff --git a/examples/src/read-data.c b/examples/src/read-data.c new file mode 100644 index 000000000..af0e66f61 --- /dev/null +++ b/examples/src/read-data.c @@ -0,0 +1,385 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +/* read-data: This program aims to test reading data from a file with arbitrary + * offset and length. This program can run either interactively (only + * specifying filename) or non-interactively (specifying filename with offset + * and length). 
+ */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "testutil.h" + +#define DEFAULT_MOUNTPOINT "/unifyfs" + +static char filename[PATH_MAX]; +static char mountpoint[PATH_MAX]; + +static char* buf; +static uint64_t bufsize; +static int check; + +static int parse_line(char* line, uint64_t* offset, uint64_t* length) +{ + char* pos = NULL; + + line[strlen(line)-1] = '\0'; + + if (strncmp("quit", line, strlen("quit")) == 0) { + return 1; + } + + pos = strchr(line, ','); + if (!pos) { + return -1; + } + + *pos = '\0'; + pos++; + + *offset = strtoull(line, NULL, 0); + *length = strtoull(pos, NULL, 0); + + return 0; +} + +static void alloc_buf(uint64_t length) +{ + if (!buf) { + bufsize = length; + buf = malloc(bufsize); + } else { + if (bufsize < length) { + buf = realloc(buf, length); + } + } + + if (!buf) { + perror("failed to allocate buffer"); + exit(1); + } +} + +static void aligned_offlen(uint64_t filesize, uint64_t blocksize, + uint64_t* off, uint64_t* len) +{ + uint64_t block_count = filesize / blocksize; + + *off = (random() % (block_count - 1)) * blocksize; + *len = blocksize; +} + + +static void random_offlen(uint64_t filesize, uint64_t maxoff, uint64_t maxlen, + uint64_t* off, uint64_t* len) +{ + uint64_t _off; + uint64_t _len; + + _off = random() % maxoff; + _len = random() % maxlen; + + while (_off + _len > filesize) { + _len = _len / 2 + 1; + } + + *len = _len; + *off = _off; +} + +static void do_pread(int fd, size_t length, off_t offset) +{ + ssize_t ret = 0; + struct timespec ts1, ts2; + double ts1nsec, ts2nsec; + double elapsed_sec, mbps; + + alloc_buf(length); + + errno = 0; + + clock_gettime(CLOCK_REALTIME, &ts1); + + ret = pread(fd, buf, length, offset); + + clock_gettime(CLOCK_REALTIME, &ts2); + + ts1nsec = 1e9 * 1.0 * ts1.tv_sec + 1.0 * ts1.tv_nsec; + ts2nsec = 1e9 * 1.0 * ts2.tv_sec + 1.0 * ts2.tv_nsec; + elapsed_sec = (ts2nsec - ts1nsec) / (1e9); + + mbps = (1.0 * length / (1<<20)) / elapsed_sec; + + printf(" -> pread(off=%lu, len=%lu) = %zd", offset, length, ret); + if (errno) { + printf(" (err=%d, %s)\n", errno, strerror(errno)); + } else { + printf(" (%.3f sec, %.3lf MB/s)\n", elapsed_sec, mbps); + + if (check) { + uint64_t error_offset; + ret = lipsum_check(buf, length, offset, &error_offset); + if (ret < 0) { + printf(" * data verification failed at offset %" PRIu64 "\n", + error_offset); + } else { + printf(" * data verification success\n"); + } + } + } +} + +static void run_interactive(int fd) +{ + int ret = 0; + uint64_t offset = 0; + uint64_t length = 0; + char* line = NULL; + char linebuf[LINE_MAX]; + + while (1) { + printf("\nread (offset,length)> "); + fflush(stdout); + line = fgets(linebuf, LINE_MAX-1, stdin); + if (!line) { + continue; + } + + ret = parse_line(line, &offset, &length); + if (ret < 0) { + continue; + } else if (1 == ret) { + printf("terminating..\n"); + break; + } + + do_pread(fd, length, offset); + } +} + +static struct option long_opts[] = { + { "help", 0, 0, 'h' }, + { "check", 0, 0, 'c' }, + { "mount", 1, 0, 'm' }, + { "offset", 1, 0, 'o' }, + { "length", 1, 0, 'l' }, + { "random", 1, 0, 'r' }, + { "max-offset", 1, 0, 'O' }, + { "max-length", 1, 0, 'L' }, + { "aligned", 1, 0, 'a' }, + { 0, 0, 0, 0}, +}; + +static char* short_opts = "hcm:o:l:r:O:L:f:"; + +static const char* usage_str = +"\n" +"Usage: %s [options...] \n" +"\n" +"Test reading data from a file . should be a full\n" +"pathname. 
If running without --offset and --length options, this will run\n" +"in an interactive mode where the offset and length should be specified\n" +"with a separating comma between them, e.g., ','.\n" +"'quit' will terminate the program.\n" +"\n" +"Available options:\n" +" -h, --help help message\n" +" -c, --check verify data content. data should be written using\n" +" the write example with --check option\n" +" -m, --mount= use for unifyfs (default: /unifyfs)\n" +" -o, --offset= read from \n" +" -l, --length= read bytes\n" +" -r, --random= generate random offset and length times,\n" +" only workin in the non-interactive mode\n" +" -O, --max-offset= generate a random offset not exceeding \n" +" -L, --max-length= generate a random length not exceeding \n" +" -f, --aligned= generate requests aligned with a blocksize \n" +"\n"; + +static char* program; + +static void print_usage(void) +{ + printf(usage_str, program); + exit(0); +} + +int main(int argc, char** argv) +{ + int ret = 0; + int ch = 0; + int optidx = 0; + int unifyfs = 0; + int fd = -1; + int random = 0; + uint64_t offset = 0; + uint64_t length = 0; + uint64_t maxoff = 0; + uint64_t maxlen = 0; + uint64_t aligned = 0; + uint64_t filesize = 0; + struct stat sb; + char* tmp_program = NULL; + + tmp_program = strdup(argv[0]); + if (!tmp_program) { + perror("failed to allocate memory"); + return -1; + } + + program = basename(tmp_program); + + while ((ch = getopt_long(argc, argv, + short_opts, long_opts, &optidx)) >= 0) { + switch (ch) { + case 'c': + check = 1; + break; + + case 'm': + sprintf(mountpoint, "%s", optarg); + break; + + case 'o': + offset = strtoull(optarg, NULL, 0); + break; + + case 'l': + length = strtoull(optarg, NULL, 0); + break; + + case 'r': + random = atoi(optarg); + break; + + case 'O': + maxoff = strtoull(optarg, NULL, 0); + break; + + case 'L': + maxlen = strtoull(optarg, NULL, 0); + break; + + case 'a': + aligned = strtoull(optarg, NULL, 0); + break; + + case 'h': + default: + print_usage(); + break; + } + } + + if (argc - optind != 1) { + print_usage(); + return -1; + } + + sprintf(filename, "%s", argv[optind]); + + if (mountpoint[0] == '\0') { + sprintf(mountpoint, "%s", DEFAULT_MOUNTPOINT); + } + + if (strncmp(filename, mountpoint, strlen(mountpoint)) == 0) { + printf("mounting unifyfs at %s ..\n", mountpoint); + + ret = unifyfs_mount(mountpoint, 0, 1, 0); + if (ret) { + fprintf(stderr, "unifyfs_mount failed (return = %d)\n", ret); + return -1; + } + + unifyfs = 1; + } + + fd = open(filename, O_RDONLY); + if (fd < 0) { + perror("open failed"); + return -1; + } + + ret = stat(filename, &sb); + if (ret < 0) { + perror("stat failed"); + goto out; + } + + filesize = sb.st_size; + printf("%s (size = %lu)\n", filename, filesize); + + if (offset == 0 && length == 0 && random == 0) { + run_interactive(fd); + } else { + if (random) { + struct timespec ts; + + clock_gettime(CLOCK_REALTIME, &ts); + srandom(ts.tv_nsec % ts.tv_sec); + + if (0 == maxoff) { + maxoff = filesize / 2; + } + + if (0 == maxlen) { + maxlen = filesize / 2; + } + + for (int i = 0; i < random; i++) { + if (aligned) { + aligned_offlen(filesize, aligned, &offset, &length); + } else { + random_offlen(filesize, maxoff, maxlen, &offset, &length); + } + do_pread(fd, length, offset); + } + } else { + do_pread(fd, length, offset); + } + } + + ret = 0; +out: + close(fd); + + if (buf) { + free(buf); + } + + if (unifyfs) { + unifyfs_unmount(); + } + + if (tmp_program) { + free(tmp_program); + } + + return ret; +} diff --git a/examples/src/read.c 
b/examples/src/read.c index c69d2b749..0e94719e8 100644 --- a/examples/src/read.c +++ b/examples/src/read.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2019, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2019, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -89,6 +89,8 @@ size_t generate_read_reqs(test_cfg* cfg, char* dstbuf, * cfg.use_mapio - support is not yet implemented. When enabled, * direct memory loads will be used for reads. * + * cfg.use_mpiio - when enabled, MPI-IO will be used. + * * cfg.use_prdwr - when enabled, pread(2) will be used. * * cfg.use_stdio - when enabled, fread(3) will be used. @@ -112,11 +114,17 @@ int main(int argc, char* argv[]) test_cfg test_config; test_cfg* cfg = &test_config; + test_timer time_stat; + test_timer time_open; test_timer time_rd; test_timer time_check; + test_timer time_close; + timer_init(&time_stat, "stat"); + timer_init(&time_open, "open"); timer_init(&time_rd, "read"); timer_init(&time_check, "check"); + timer_init(&time_close, "close"); rc = test_init(argc, argv, cfg); if (rc) { @@ -138,6 +146,7 @@ int main(int argc, char* argv[]) target_file); // file size check + timer_start_barrier(cfg, &time_stat); size_t rank_bytes = test_config.n_blocks * test_config.block_sz; size_t total_bytes = rank_bytes * test_config.n_ranks; size_t expected = total_bytes; @@ -158,12 +167,17 @@ int main(int argc, char* argv[]) } } } + timer_stop_barrier(cfg, &time_stat); + test_print_verbose_once(cfg, "DEBUG: finished stat"); // open file + timer_start_barrier(cfg, &time_open); rc = test_open_file(cfg, target_file, O_RDONLY); if (rc) { test_abort(cfg, rc); } + timer_stop_barrier(cfg, &time_open); + test_print_verbose_once(cfg, "DEBUG: finished open"); // generate read requests test_print_verbose_once(cfg, "DEBUG: generating read requests"); @@ -178,8 +192,7 @@ int main(int argc, char* argv[]) // do reads test_print_verbose_once(cfg, "DEBUG: starting read requests"); - test_barrier(cfg); - timer_start(&time_rd); + timer_start_barrier(cfg, &time_rd); rc = issue_read_req_batch(cfg, num_reqs, reqs); if (rc) { test_abort(cfg, rc); @@ -188,18 +201,17 @@ int main(int argc, char* argv[]) if (rc) { test_abort(cfg, rc); } - timer_stop(&time_rd); + timer_stop_barrier(cfg, &time_rd); test_print_verbose_once(cfg, "DEBUG: finished read requests"); // check file data - test_barrier(cfg); test_print_verbose_once(cfg, "DEBUG: starting data check"); - timer_start(&time_check); + timer_start_barrier(cfg, &time_check); rc = check_read_req_batch(cfg, num_reqs, reqs); if (rc) { test_abort(cfg, rc); } - timer_stop(&time_check); + timer_stop_barrier(cfg, &time_check); test_print_verbose_once(cfg, "DEBUG: finished data check"); // post-read cleanup @@ -208,10 +220,13 @@ int main(int argc, char* argv[]) reqs = NULL; // close file + timer_start_barrier(cfg, &time_close); rc = test_close_file(cfg); if (rc) { test_abort(cfg, rc); } + timer_stop_barrier(cfg, &time_close); + test_print_verbose_once(cfg, "DEBUG: finished close"); // calculate achieved bandwidth rates double max_read_time, max_check_time; @@ -232,8 +247,11 @@ int main(int argc, char* argv[]) "Number of processes: %d\n" "Each process wrote: %.2lf MiB\n" "Total data written: %.2lf MiB\n" + "File stat time: %.6lf sec\n" + "File open time: %.6lf sec\n" "Maximum read time: %.6lf sec\n" "Maximum check time: %.6lf sec\n" + "File close time: %.6lf 
sec\n" "Aggregate read bandwidth: %.3lf MiB/s\n" "Effective read bandwidth: %.3lf MiB/s\n", io_pattern_str(test_config.io_pattern), @@ -242,16 +260,22 @@ int main(int argc, char* argv[]) test_config.n_ranks, bytes_to_mib(rank_bytes), bytes_to_mib(total_bytes), + time_stat.elapsed_sec_all, + time_open.elapsed_sec_all, max_read_time, max_check_time, + time_close.elapsed_sec_all, aggr_read_bw, eff_read_bw); // cleanup free(target_file); + timer_fini(&time_stat); + timer_fini(&time_open); timer_fini(&time_rd); timer_fini(&time_check); + timer_fini(&time_close); test_fini(cfg); diff --git a/examples/src/simul.c b/examples/src/simul.c new file mode 100644 index 000000000..3b1c2fa85 --- /dev/null +++ b/examples/src/simul.c @@ -0,0 +1,1326 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +/* + * Copyright (C) 2003, The Regents of the University of California. + * Produced at the Lawrence Livermore National Laboratory. + * Written by Christopher J. Morrone + * UCRL-CODE-2003-019 + * All rights reserved. + */ + +/* + * Some modifications including style changes have been made for testing + * unifyfs: + * To test with UnifyFS, pass the '-u' flag. Then, simul will set the test + * directory as /unifyfs, where unifyfs will be accordingly mounted. + */ + +#include "mpi.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* unifyfs test */ +#include + +#define FILEMODE S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH +#define DIRMODE S_IRUSR|S_IXUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IXOTH +#define SHARED 1 +#define MAX_FILENAME_LEN 512 + +int rank; +int size; +char *testdir = NULL; +char hostname[1024]; +int verbose; +int throttle = 1; +struct timeval t1, t2; +static char version[] = "1.16"; + +#ifdef __GNUC__ + /* "inline" is a keyword in GNU C */ +#elif __STDC_VERSION__ >= 199901L + /* "inline" is a keyword in C99 and later versions */ +#else +# define inline /* "inline" not available */ +#endif + +#ifndef AIX +#define FAIL(msg) do { \ + fprintf(stdout, "%s: Process %d(%s): FAILED in %s, %s: %s\n",\ + timestamp(), rank, hostname, __func__, \ + msg, strerror(errno)); \ + fflush(stdout);\ + MPI_Abort(MPI_COMM_WORLD, 1); \ +} while(0) +#else +#define FAIL(msg) do { \ + fprintf(stdout, "%s: Process %d(%s): FAILED, %s: %s\n",\ + timestamp(), rank, hostname, \ + msg, strerror(errno)); \ + fflush(stdout);\ + MPI_Abort(MPI_COMM_WORLD, 1); \ +} while(0) +#endif + +char *timestamp() { + static char datestring[80]; + time_t timestamp; + + fflush(stdout); + timestamp = time(NULL); + strftime(datestring, 80, "%T", localtime(×tamp)); + + return datestring; +} + +static inline void begin(char *str) { + if (verbose > 0 && rank == 0) { + gettimeofday(&t1, NULL); + fprintf(stdout, "%s:\tBeginning %s\n", timestamp(), str); + fflush(stdout); + } +} + +static inline void end(char *str) { + double elapsed; + + MPI_Barrier(MPI_COMM_WORLD); + if (verbose > 0 && rank == 0) { + gettimeofday(&t2, NULL); + elapsed = ((((t2.tv_sec - t1.tv_sec) * 1000000L) + + t2.tv_usec) - t1.tv_usec) + / (double)1000000; + if (elapsed >= 60) { + fprintf(stdout, "%s:\tFinished %-15s(%.2f min)\n", + timestamp(), str, elapsed / 60); + } else 
{ + fprintf(stdout, "%s:\tFinished %-15s(%.3f sec)\n", + timestamp(), str, elapsed); + + } + fflush(stdout); + } +} + +void Seq_begin(MPI_Comm comm, int numprocs) { + int size; + int rank; + int buf; + MPI_Status status; + + MPI_Comm_size(comm, &size); + MPI_Comm_rank(comm, &rank); + + if (rank >= numprocs) { + MPI_Recv(&buf, 1, MPI_INT, rank-numprocs, 1333, comm, &status); + } +} + +void Seq_end(MPI_Comm comm, int numprocs) { + int size; + int rank; + int buf; + + MPI_Comm_size(comm, &size); + MPI_Comm_rank(comm, &rank); + + if ((rank + numprocs) < size) { + MPI_Send(&buf, 1, MPI_INT, rank+numprocs, 1333, comm); + } +} + +/* This function does not FAIL if the requested "name" does not exist. This + is just to clean up any files or directories left over from previous runs.*/ +void remove_file_or_dir(char *name) { + struct stat statbuf; + char errmsg[MAX_FILENAME_LEN+20]; + + if (stat(name, &statbuf) != -1) { + if (S_ISREG(statbuf.st_mode)) { + printf("stale file found\n"); + if (unlink(name) == -1) { + sprintf(errmsg, "unlink of %s", name); + FAIL(errmsg); + } + } + if (S_ISDIR(statbuf.st_mode)) { + printf("stale directory found\n"); + if (rmdir(name) == -1) { + sprintf(errmsg, "rmmdir of %s", name); + FAIL(errmsg); + } + } + } +} + +char *create_files(char *prefix, int filesize, int shared) { + static char filename[MAX_FILENAME_LEN]; + char errmsg[MAX_FILENAME_LEN+20]; + int fd, i; + short zero = 0; + + /* Process 0 creates the test file(s) */ + if (rank == 0) { + for (i = 0; i < (shared ? 1 : size); i++) { + sprintf(filename, "%s/%s.%d", testdir, prefix, i); + remove_file_or_dir(filename); + if ((fd = creat(filename, FILEMODE)) == -1) { + sprintf(errmsg, "creat of file %s", filename); + FAIL(errmsg); + } + if (filesize > 0) { + if (lseek(fd, filesize - 1, SEEK_SET) == -1) { + sprintf(errmsg, "lseek in file %s", filename); + FAIL(errmsg); + } + if (write(fd, &zero, 1) == -1) { + sprintf(errmsg, "write in file %s", filename); + FAIL(errmsg); + } + } + if (close(fd) == -1) { + sprintf(errmsg, "close of file %s", filename); + FAIL(errmsg); + } + } + } + + if (shared) + sprintf(filename, "%s/%s.0", testdir, prefix); + else + sprintf(filename, "%s/%s.%d", testdir, prefix, rank); + + return filename; +} + +void remove_files(char *prefix, int shared) { + char filename[1024]; + int i; + + /* Process 0 removes the file(s) */ + if (rank == 0) { + for (i = 0; i < (shared ? 1 : size); i++) { + sprintf(filename, "%s/%s.%d", testdir, prefix, i); + /*printf("Removing file %s\n", filename); fflush(stdout);*/ + if (unlink(filename) == -1) { + FAIL("unlink failed"); + } + } + } +} + +char *create_dirs(char *prefix, int shared) { + static char dirname[1024]; + int i; + + /* Process 0 creates the test file(s) */ + if (rank == 0) { + for (i = 0; i < (shared ? 1 : size); i++) { + sprintf(dirname, "%s/%s.%d", testdir, prefix, i); + remove_file_or_dir(dirname); + if (mkdir(dirname, DIRMODE) == -1) { + FAIL("init mkdir failed"); + } + } + } + + if (shared) + sprintf(dirname, "%s/%s.0", testdir, prefix); + else + sprintf(dirname, "%s/%s.%d", testdir, prefix, rank); + + return dirname; +} + +void remove_dirs(char *prefix, int shared) { + char dirname[1024]; + int i; + + /* Process 0 removes the file(s) */ + if (rank == 0) { + for (i = 0; i < (shared ? 
1 : size); i++) { + sprintf(dirname, "%s/%s.%d", testdir, prefix, i); + if (rmdir(dirname) == -1) { + FAIL("rmdir failed"); + } + } + } +} + +char *create_symlinks(char *prefix, int shared) { + static char filename[1024]; + static char linkname[1024]; + int i; + + /* Process 0 creates the test file(s) */ + if (rank == 0) { + for (i = 0; i < (shared ? 1 : size); i++) { + sprintf(filename, "%s/symlink_target", testdir); + sprintf(linkname, "%s/%s.%d", testdir, prefix, i); + remove_file_or_dir(linkname); + if (symlink(filename, linkname) == -1) { + FAIL("symlink failed"); + } + } + } + + if (shared) + sprintf(linkname, "%s/%s.0", testdir, prefix); + else + sprintf(linkname, "%s/%s.%d", testdir, prefix, rank); + + return linkname; +} + +void check_single_success(char *testname, int rc, int error_rc) { + int *rc_vec, i; + int fail = 0; + int pass = 0; + + if (rank == 0) { + if ((rc_vec = (int *)malloc(sizeof(int)*size)) == NULL) { + FAIL("malloc failed"); + } + } + MPI_Gather(&rc, 1, MPI_INT, rc_vec, 1, MPI_INT, 0, MPI_COMM_WORLD); + if (rank == 0) { + for (i = 0; i < size; i++) { + if (rc_vec[i] == error_rc) + fail++; + else + pass++; + } + if (!((pass == 1) && (fail == size-1))) { + fprintf(stdout, "%s: FAILED in %s: ", timestamp(), testname); + if (pass > 1) + fprintf(stdout, "too many operations succeeded (%d).\n", pass); + else + fprintf(stdout, "too many operations failed (%d).\n", fail); + fflush(stdout); + MPI_Abort(MPI_COMM_WORLD, 1); + } + free(rc_vec); + } +} + +void simul_open(int shared) { + int fd; + char *filename; + + begin("setup"); + filename = create_files("simul_open", 0, shared); + end("setup"); + + /* All open the file simultaneously */ + begin("test"); + if ((fd = open(filename, O_RDWR)) == -1) { + FAIL("open failed"); + } + end("test"); + + /* All close the file one at a time */ + begin("cleanup"); + Seq_begin(MPI_COMM_WORLD, throttle); + if (close(fd) == -1) { + FAIL("close failed"); + } + Seq_end(MPI_COMM_WORLD, throttle); + MPI_Barrier(MPI_COMM_WORLD); + remove_files("simul_open", shared); + end("cleanup"); +} + +void simul_close(int shared) { + int fd; + char *filename; + + begin("setup"); + filename = create_files("simul_close", 0, shared); + MPI_Barrier(MPI_COMM_WORLD); + /* All open the file one at a time */ + Seq_begin(MPI_COMM_WORLD, throttle); + if ((fd = open(filename, O_RDWR)) == -1) { + FAIL("open failed"); + } + Seq_end(MPI_COMM_WORLD, throttle); + end("setup"); + + begin("test"); + /* All close the file simultaneously */ + if (close(fd) == -1) { + FAIL("close failed"); + } + end("test"); + + begin("cleanup"); + remove_files("simul_close", shared); + end("cleanup"); +} + +void simul_chdir(int shared) { + char cwd[1024]; + char *dirname; + + begin("setup"); + if (getcwd(cwd, 1024) == NULL) { + FAIL("init getcwd failed"); + } + dirname = create_dirs("simul_chdir", shared); + end("setup"); + + begin("test"); + /* All chdir to dirname */ + if (chdir(dirname) == -1) { + FAIL("chdir failed"); + } + end("test"); + + begin("cleanup"); + /* All chdir back to old cwd */ + if (chdir(cwd) == -1) { + FAIL("chdir back failed"); + } + MPI_Barrier(MPI_COMM_WORLD); + remove_dirs("simul_chdir", shared); + end("cleanup"); +} + +void simul_file_stat(int shared) { + char *filename; + struct stat buf; + + begin("setup"); + filename = create_files("simul_file_stat", 0, shared); + end("setup"); + + begin("test"); + /* All stat the file */ + if (stat(filename, &buf) == -1) { + FAIL("stat failed"); + } + end("test"); + + begin("cleanup"); + remove_files("simul_file_stat", shared); + 
end("cleanup"); +} + +void simul_dir_stat(int shared) { + char *dirname; + struct stat buf; + + begin("setup"); + dirname = create_dirs("simul_dir_stat", shared); + end("setup"); + + begin("test"); + /* All stat the directory */ + if (stat(dirname, &buf) == -1) { + FAIL("stat failed"); + } + end("test"); + + begin("cleanup"); + remove_dirs("simul_dir_stat", shared); + end("cleanup"); +} + +void simul_readdir(int shared) { + DIR *dir; + char *dirname; + struct dirent *dptr; + + begin("setup"); + dirname = create_dirs("simul_readdir", shared); + MPI_Barrier(MPI_COMM_WORLD); + /* All open the directory(ies) one at a time */ + Seq_begin(MPI_COMM_WORLD, throttle); + if ((dir = opendir(dirname)) == NULL) { + FAIL("init opendir failed"); + } + Seq_end(MPI_COMM_WORLD, throttle); + end("setup"); + + begin("test"); + /* All readdir the directory stream(s) */ + if ((dptr = readdir(dir)) == NULL) { + FAIL("readdir failed"); + } + end("test"); + + begin("cleanup"); + /* All close the directory(ies) one at a time */ + Seq_begin(MPI_COMM_WORLD, throttle); + if (closedir(dir) == -1) { + FAIL("closedir failed"); + } + Seq_end(MPI_COMM_WORLD, throttle); + MPI_Barrier(MPI_COMM_WORLD); + remove_dirs("simul_readdir", shared); + end("cleanup"); +} + +void simul_statfs(int shared) { + char *filename; + struct statfs buf; + + begin("setup"); + filename = create_files("simul_statfs", 0, shared); + end("setup"); + + begin("test"); + /* All statfs the file(s) */ + if (statfs(filename, &buf) == -1) { + FAIL("statfs failed"); + } + end("test"); + + begin("cleanup"); + remove_files("simul_statfs", shared); + end("cleanup"); +} + +void simul_lseek(int shared) { + int fd; + char *filename; + + begin("setup"); + filename = create_files("simul_lseek", 0, shared); + MPI_Barrier(MPI_COMM_WORLD); + /* All open the file(s) one at a time */ + Seq_begin(MPI_COMM_WORLD, throttle); + if ((fd = open(filename, O_RDWR)) == -1) { + FAIL("init open failed"); + } + Seq_end(MPI_COMM_WORLD, throttle); + end("setup"); + + begin("test"); + /* All lseek simultaneously */ + if (lseek(fd, 1024, SEEK_SET) == -1) { + FAIL("lseek failed"); + MPI_Abort(MPI_COMM_WORLD, 1); + } + end("test"); + + begin("cleanup"); + /* All close the file(s) one at a time */ + Seq_begin(MPI_COMM_WORLD, throttle); + if (close(fd) == -1) { + FAIL("cleanup close failed"); + } + Seq_end(MPI_COMM_WORLD, throttle); + MPI_Barrier(MPI_COMM_WORLD); + remove_files("simul_lseek", shared); + end("cleanup"); +} + +void simul_read(int shared) { + int fd; + ssize_t fin; + char buf[1024]; + char *filename; + int i = 0; + int retry = 100; + + begin("setup"); + filename = create_files("simul_read", 1024, shared); + MPI_Barrier(MPI_COMM_WORLD); + /* All open the file one at a time */ + Seq_begin(MPI_COMM_WORLD, throttle); + if ((fd = open(filename, O_RDWR)) == -1) { + FAIL("init open failed"); + } + Seq_end(MPI_COMM_WORLD, throttle); + end("setup"); + + begin("test"); + /* All read simultaneously */ + for (i = 1024; (i > 0) && (retry > 0); i -= fin, retry--) { + if ((fin = read(fd, buf, (size_t)i)) == -1) { + FAIL("read failed"); + } + } + if( (retry == 0) && (i > 0) ) + FAIL("read exceeded retry count"); + end("test"); + + begin("cleanup"); + /* All close the file one at a time */ + Seq_begin(MPI_COMM_WORLD, throttle); + if (close(fd) == -1) { + FAIL("cleanup close failed"); + } + Seq_end(MPI_COMM_WORLD, throttle); + MPI_Barrier(MPI_COMM_WORLD); + remove_files("simul_read", shared); + end("cleanup"); +} + +void simul_write(int shared) { + int fd; + ssize_t fin; + char *filename; + int 
i = 0; + int retry = 100; + + begin("setup"); + filename = create_files("simul_write", size * sizeof(int), shared); + MPI_Barrier(MPI_COMM_WORLD); + /* All open the file and lseek one at a time */ + Seq_begin(MPI_COMM_WORLD, throttle); + if ((fd = open(filename, O_RDWR)) == -1) { + FAIL("init open failed"); + } + if (lseek(fd, rank*sizeof(int), SEEK_SET) == -1) { + FAIL("init lseek failed"); + } + Seq_end(MPI_COMM_WORLD, throttle); + end("setup"); + + begin("test"); + /* All write simultaneously */ + for (i = sizeof(int); (i > 0) && (retry > 0); i -= fin, retry--) { + if ((fin = write(fd, &rank, (size_t)i)) == -1) { + FAIL("write failed"); + } + } + if( (retry == 0) && (i > 0) ) + FAIL("write exceeded retry count"); + end("test"); + + begin("cleanup"); + /* All close the file one at a time */ + Seq_begin(MPI_COMM_WORLD, throttle); + if (close(fd) == -1) { + FAIL("cleanup close failed"); + } + Seq_end(MPI_COMM_WORLD, throttle); + MPI_Barrier(MPI_COMM_WORLD); + remove_files("simul_write", shared); + end("cleanup"); +} + +void simul_mkdir(int shared) { + int rc, i; + char dirname[MAX_FILENAME_LEN]; + + begin("setup"); + if (shared) + sprintf(dirname, "%s/simul_mkdir.0", testdir); + else + sprintf(dirname, "%s/simul_mkdir.%d", testdir, rank); + if (rank == 0) { + for (i = 0; i < (shared ? 1 : size); i++) { + char buf[MAX_FILENAME_LEN]; + sprintf(buf, "%s/simul_mkdir.%d", testdir, i); + remove_file_or_dir(buf); + } + } + end("setup"); + + begin("test"); + /* All mkdir dirname */ + rc = mkdir(dirname, DIRMODE); + if (!shared) { + if (rc == -1) { + FAIL("mkdir failed"); + } + MPI_Barrier(MPI_COMM_WORLD); + } else { /* Only one should succeed */ + check_single_success("simul_mkdir", rc, -1); + } + end("test"); + + begin("cleanup"); + remove_dirs("simul_mkdir", shared); + end("cleanup"); +} + +void simul_rmdir(int shared) { + int rc; + char *dirname; + + begin("setup"); + dirname = create_dirs("simul_rmdir", shared); + MPI_Barrier(MPI_COMM_WORLD); + end("setup"); + + begin("test"); + /* All rmdir dirname */ + rc = rmdir(dirname); + if (!shared) { + if (rc == -1) { + FAIL("rmdir failed"); + } + MPI_Barrier(MPI_COMM_WORLD); + } else { /* Only one should succeed */ + check_single_success("simul_rmdir", rc, -1); + } + end("test"); + + begin("cleanup"); + end("cleanup"); +} + +void simul_creat(int shared) { + int fd, i; + char filename[1024]; + + begin("setup"); + if (shared) + sprintf(filename, "%s/simul_creat.0", testdir); + else + sprintf(filename, "%s/simul_creat.%d", testdir, rank); + if (rank == 0) { + for (i = 0; i < (shared ? 
1 : size); i++) { + char buf[MAX_FILENAME_LEN]; + sprintf(buf, "%s/simul_creat.%d", testdir, i); + remove_file_or_dir(buf); + } + } + end("setup"); + + begin("test"); + /* All create the files simultaneously */ + fd = creat(filename, FILEMODE); + if (fd == -1) { + FAIL("creat failed"); + } + end("test"); + + begin("cleanup"); + /* All close the files one at a time */ + Seq_begin(MPI_COMM_WORLD, throttle); + if (close(fd) == -1) { + FAIL("close failed"); + } + Seq_end(MPI_COMM_WORLD, throttle); + MPI_Barrier(MPI_COMM_WORLD); + remove_files("simul_creat", shared); + end("cleanup"); +} + +void simul_unlink(int shared) { + int rc; + char *filename; + + begin("setup"); + filename = create_files("simul_unlink", 0, shared); + end("setup"); + + begin("test"); + /* All unlink the files simultaneously */ + rc = unlink(filename); + if (!shared) { + if (rc == -1) { + FAIL("unlink failed"); + } + } else { + check_single_success("simul_unlink", rc, -1); + } + end("test"); + + begin("cleanup"); + end("cleanup"); +} + +void simul_rename(int shared) { + int rc, i; + char *oldfilename; + char newfilename[1024]; + char *testname = "simul_rename"; + + begin("setup"); + oldfilename = create_files(testname, 0, shared); + sprintf(newfilename, "%s/%s_new.%d", testdir, testname, rank); + if (rank == 0) { + for (i = 0; i < (shared ? 1 : size); i++) { + char buf[MAX_FILENAME_LEN]; + sprintf(buf, "%s/%s_new.%d", testdir, testname, i); + remove_file_or_dir(buf); + } + } + end("setup"); + + begin("test"); + /* All rename the files simultaneously */ + rc = rename(oldfilename, newfilename); + if (!shared) { + if (rc == -1) { + FAIL("stat failed"); + } + } else { + check_single_success(testname, rc, -1); + } + end("test"); + + begin("cleanup"); + if (rc == 0) { + if (unlink(newfilename) == -1) + FAIL("unlink failed"); + } + end("cleanup"); +} + +void simul_truncate(int shared) { + char *filename; + + begin("setup"); + filename = create_files("simul_truncate", 2048, shared); + end("setup"); + + begin("test"); + /* All truncate simultaneously */ + if (truncate(filename, 1024) == -1) { + FAIL("truncate failed"); + } + end("test"); + + begin("cleanup"); + remove_files("simul_truncate", shared); + end("cleanup"); +} + +void simul_readlink(int shared) { + char *linkname; + char buf[1024]; + + begin("setup"); + linkname = create_symlinks("simul_readlink", shared); + end("setup"); + + begin("test"); + /* All read the symlink(s) simultaneously */ + if (readlink(linkname, buf, 1024) == -1) { + FAIL("readlink failed"); + } + end("test"); + + begin("cleanup"); + remove_files("simul_readlink", shared); + end("cleanup"); +} + +void simul_symlink(int shared) { + int rc, i; + char linkname[MAX_FILENAME_LEN]; + char filename[MAX_FILENAME_LEN]; + + begin("setup"); + if (shared) + sprintf(linkname, "%s/simul_symlink.0", testdir); + else + sprintf(linkname, "%s/simul_symlink.%d", testdir, rank); + if (rank == 0) { + for (i = 0; i < (shared ? 
1 : size); i++) { + char buf[MAX_FILENAME_LEN]; + sprintf(buf, "%s/simul_symlink.%d", testdir, i); + remove_file_or_dir(buf); + } + } + sprintf(filename, "%s/simul_symlink_target", testdir); + end("setup"); + + begin("test"); + /* All create the symlinks simultaneously */ + rc = symlink(filename, linkname); + if (!shared) { + if (rc == -1) { + FAIL("symlink failed"); + } + } else { + check_single_success("simul_symlink", rc, -1); + } + end("test"); + + begin("cleanup"); + remove_files("simul_symlink", shared); + end("cleanup"); +} + +void simul_link_to_one(int shared) { + int rc, i; + char *filename; + char linkname[1024]; + + begin("setup"); + if (shared) + sprintf(linkname, "%s/simul_link.0", testdir); + else + sprintf(linkname, "%s/simul_link.%d", testdir, rank); + if (rank == 0) { + for (i = 0; i < (shared ? 1 : size); i++) { + char buf[MAX_FILENAME_LEN]; + sprintf(buf, "%s/simul_link.%d", testdir, i); + remove_file_or_dir(buf); + } + } + filename = create_files("simul_link_target", 0, SHARED); + end("setup"); + + begin("test"); + /* All create the hard links simultaneously */ + rc = link(filename, linkname); + if (!shared) { + if (rc == -1) { + FAIL("link failed"); + } + } else { + check_single_success("simul_link_to_one", rc, -1); + } + end("test"); + + begin("cleanup"); + remove_files("simul_link_target", SHARED); + remove_files("simul_link", shared); + end("cleanup"); +} + +void simul_link_to_many(int shared) { + char *filename; + char linkname[1024]; + int i; + + if (shared) { + if (verbose > 0 && rank == 0) + printf("%s:\tThis is just a place holder; no test is run here.\n", + timestamp()); + return; + } + begin("setup"); + filename = create_files("simul_link", 0, shared); + sprintf(linkname, "%s/simul_link_target.%d", testdir, rank); + if (rank == 0) { + for (i = 0; i < size; i++) { + char buf[MAX_FILENAME_LEN]; + sprintf(buf, "%s/simul_link_target.%d", testdir, i); + remove_file_or_dir(buf); + } + } + end("setup"); + + begin("test"); + /* All create the hard links simultaneously */ + if (link(filename, linkname) == -1) { + FAIL("link failed"); + } + end("test"); + + begin("cleanup"); + remove_files("simul_link", shared); + remove_files("simul_link_target", !SHARED); + end("cleanup"); +} + +void simul_fcntl_lock(int shared) { + int rc, fd; + char *filename; + struct flock sf_lock = { + .l_type = F_WRLCK, + .l_whence = SEEK_SET, + .l_start = 0, + .l_len = 0 + }; + struct flock sf_unlock = { + .l_type = F_UNLCK, + .l_whence = SEEK_SET, + .l_start = 0, + .l_len = 0 + }; + + begin("setup"); + filename = create_files("simul_fcntl", 0, shared); + MPI_Barrier(MPI_COMM_WORLD); + /* All open the file one at a time */ + Seq_begin(MPI_COMM_WORLD, throttle); + if ((fd = open(filename, O_RDWR)) == -1) { + FAIL("open failed"); + } + Seq_end(MPI_COMM_WORLD, throttle); + end("setup"); + + begin("test"); + /* All lock the file(s) simultaneously */ + rc = fcntl(fd, F_SETLK, &sf_lock); + if (!shared) { + if (rc == -1) { + if (errno == ENOSYS) { + if (rank == 0) { + fprintf(stdout, "WARNING: fcntl locking not supported.\n"); + fflush(stdout); + } + } else { + FAIL("fcntl lock failed"); + } + } + MPI_Barrier(MPI_COMM_WORLD); + } else { + int saved_errno = errno; + int *rc_vec, *er_vec, i; + int fail = 0; + int pass = 0; + int nosys = 0; + if (rank == 0) { + if ((rc_vec = (int *)malloc(sizeof(int)*size)) == NULL) + FAIL("malloc failed"); + if ((er_vec = (int *)malloc(sizeof(int)*size)) == NULL) + FAIL("malloc failed"); + } + MPI_Gather(&rc, 1, MPI_INT, rc_vec, 1, MPI_INT, 0, MPI_COMM_WORLD); + 
MPI_Gather(&saved_errno, 1, MPI_INT, er_vec, 1, MPI_INT, 0, + MPI_COMM_WORLD); + if (rank == 0) { + for (i = 0; i < size; i++) { + if (rc_vec[i] == -1) { + if (er_vec[i] == ENOSYS) { + nosys++; + } else if (er_vec[i] != EACCES && er_vec[i] != EAGAIN) { + errno = er_vec[i]; + FAIL("fcntl failed as expected, but with wrong errno"); + } + fail++; + } else { + pass++; + } + } + if (nosys == size) { + fprintf(stdout, "WARNING: fcntl locking not supported.\n"); + fflush(stdout); + } else if (!((pass == 1) && (fail == size-1))) { + fprintf(stdout, + "%s: FAILED in simul_fcntl_lock", timestamp()); + if (pass > 1) + fprintf(stdout, + "too many fcntl locks succeeded (%d).\n", pass); + else + fprintf(stdout, + "too many fcntl locks failed (%d).\n", fail); + fflush(stdout); + MPI_Abort(MPI_COMM_WORLD, 1); + } + free(rc_vec); + free(er_vec); + } + } + end("test"); + + begin("cleanup"); + /* All close the file one at a time */ + Seq_begin(MPI_COMM_WORLD, throttle); + if (!shared || rank == 0) { + rc = fcntl(fd, F_SETLK, &sf_unlock); + if (rc == -1 && errno != ENOSYS) + FAIL("fcntl unlock failed"); + } + if (close(fd) == -1) { + FAIL("close failed"); + } + Seq_end(MPI_COMM_WORLD, throttle); + MPI_Barrier(MPI_COMM_WORLD); + remove_files("simul_fcntl", shared); + end("cleanup"); +} + +struct test { + char *name; + void (*function) (int); + int simul; /* Flag designating support for simultaneus mode */ + int indiv; /* Flag designating support for individual mode */ +}; + +static struct test testlist[] = { + {"open", simul_open}, + {"close", simul_close}, + {"file stat", simul_file_stat}, + {"lseek", simul_lseek}, + {"read", simul_read}, + {"write", simul_write}, + {"chdir", simul_chdir}, + {"directory stat", simul_dir_stat}, + {"statfs", simul_statfs}, + {"readdir", simul_readdir}, + {"mkdir", simul_mkdir}, + {"rmdir", simul_rmdir}, + {"unlink", simul_unlink}, + {"rename", simul_rename}, + {"creat", simul_creat}, + {"truncate", simul_truncate}, + {"symlink", simul_symlink}, + {"readlink", simul_readlink}, + {"link to one file", simul_link_to_one}, + {"link to a file per process", simul_link_to_many}, + {"fcntl locking", simul_fcntl_lock}, + {0} +}; + +/* Searches an array of ints for one int. A "-1" must mark the end of the + array. 
*/ +int int_in_list(int item, int *list) { + int *ptr; + + if (list == NULL) + return 0; + ptr = list; + while (*ptr != -1) { + if (*ptr == item) + return 1; + ptr += 1; + } + return 0; +} + +/* Breaks string of comma-sperated ints into an array of ints */ +int *string_split(char *string) { + char *ptr; + char *tmp; + int excl_cnt = 1; + int *list; + int i; + + ptr = string; + while((tmp = strchr(ptr, ','))) { + ptr = tmp + 1; + excl_cnt++; + } + + list = (int *)malloc(sizeof(int) * (excl_cnt + 1)); + if (list == NULL) FAIL("malloc failed"); + + tmp = (strtok(string, ", ")); + if (tmp == NULL) FAIL("strtok failed"); + list[0] = atoi(tmp); + for (i = 1; i < excl_cnt; i++) { + tmp = (strtok(NULL, ", ")); + if (tmp == NULL) FAIL("strtok failed"); + list[i] = atoi(tmp); + } + list[i] = -1; + + return list; +} + +void print_help(int testcnt) { + int i; + + if (rank == 0) { + printf("simul-%s\n", version); + printf("Usage: simul [-h] -d [-f firsttest] [-l lasttest]\n"); + printf(" [-n #] [-N #] [-i \"4,7,13\"] [-e \"6,22\"] [-s]\n"); + printf(" [-v] [-V #]\n"); + printf("\t-h: prints this help message\n"); + printf("\t-d: the directory in which the tests will run\n"); + printf("\t-f: the number of the first test to run (default: 0)\n"); + printf("\t-l: the number of the last test to run (default: %d)\n", + (testcnt*2)-1); + printf("\t-i: comma-sperated list of tests to include\n"); + printf("\t-e: comma-sperated list of tests to exclude\n"); + printf("\t-s: single-step through every iteration of every test\n"); + printf("\t-n: repeat each test # times (default: 1)\n"); + printf("\t-N: repeat the entire set of tests # times (default: 1)\n"); + printf("\t-v: increases the verbositly level by 1\n"); + printf("\t-u: test with unifyfs\n"); + printf("\t-V: select a specific verbosity level\n"); + printf("\nThe available tests are:\n"); + for (i = 0; i < testcnt * 2; i++) { + printf("\tTest #%d: %s, %s mode.\n", i, + testlist[i%testcnt].name, + (i < testcnt) ? "shared" : "individual"); + } + } + + MPI_Initialized(&i); + if (i) MPI_Finalize(); + exit(0); +} + +int main(int argc, char **argv) { + int testcnt; + int first; + int last; + int i, j, k, c; + int *excl_list = NULL; + int *incl_list = NULL; + int test; + int singlestep = 0; + int iterations = 1; + int set_iterations = 1; + int unifyfs = 0; + int ret = 0; + char linebuf[80]; + + /* Check for -h parameter before MPI_Init so the simul binary can be + called directly, without, for instance, mpirun. 
*/ + for (testcnt = 1; testlist[testcnt].name != 0; testcnt++) continue; + for (i = 1; i < argc; i++) { + if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help")) { + print_help(testcnt); + } + } + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + if (rank == 0) { + printf("Simul is running with %d process(es)\n", size); + fflush(stdout); + } + + first = 0; + last = testcnt * 2; + + /* Parse command line options */ + while (1) { + c = getopt(argc, argv, "d:e:f:hi:l:n:N:suvV:"); + if (c == -1) + break; + + switch (c) { + case 'd': + testdir = optarg; + break; + case 'e': + excl_list = string_split(optarg); + break; + case 'f': + first = atoi(optarg); + if (first >= last) { + printf("Invalid parameter, firsttest must be <= lasttest\n"); + MPI_Abort(MPI_COMM_WORLD, 2); + } + break; + case 'h': + print_help(testcnt); + break; + case 'i': + incl_list = string_split(optarg); + break; + case 'l': + last = atoi(optarg)+1; + if (last <= first) { + printf("Invalid parameter, lasttest must be >= firsttest\n"); + MPI_Abort(MPI_COMM_WORLD, 2); + } + break; + case 'n': + iterations = atoi(optarg); + break; + case 'N': + set_iterations = atoi(optarg); + break; + case 's': + singlestep = 1; + break; + case 'u': + unifyfs = 1; + break; + case 'v': + verbose += 1; + break; + case 'V': + verbose = atoi(optarg); + break; + } + } + + /* mount the unifyfs and use the testdir as the mountpoint. + * if testdir is not specified, use '/unifyfs.' */ + if (unifyfs) { + int ret = 0; + + if (!testdir) { + testdir = "/unifyfs"; + } + ret = unifyfs_mount(testdir, rank, size, 0); + if (ret && rank == 0) { + printf("unifyfs_mount failed (ret=%d)\n", ret); + MPI_Abort(MPI_COMM_WORLD, 2); + } + } + + MPI_Barrier(MPI_COMM_WORLD); + + if (testdir == NULL && rank == 0) { + printf("Please specify a test directory! (\"simul -h\" for help)\n"); + MPI_Abort(MPI_COMM_WORLD, 2); + } + + if (gethostname(hostname, 1024) == -1) { + perror("gethostname"); + MPI_Abort(MPI_COMM_WORLD, 2); + } + + /* If a list of tests was not specified with the -i option, then use + the first and last number to build a range of included tests. */ + if (incl_list == NULL) { + incl_list = (int *)malloc(sizeof(int) * (2+last-first)); + for (i = 0; i < last-first; i++) { + incl_list[i] = i + first; + } + incl_list[i] = -1; + } + + /* Run the tests */ + for (k = 0; k < set_iterations; k++) { + if ((rank == 0) && (set_iterations > 1)) + printf("%s: Set iteration %d\n", timestamp(), k); + for (i = 0; ; ++i) { + test = incl_list[i]; + if (test == -1) + break; + if (!int_in_list(test, excl_list)) { + for (j = 0; j < iterations; j++) { + if (singlestep) { + if (rank == 0) + printf("%s: Hit to run test #%d(iter %d): %s, %s mode.\n", + timestamp(), test, j, + testlist[test%testcnt].name, + (test < testcnt) ? "shared" : "individual"); + fgets(linebuf, 80, stdin); + } + if (rank == 0) { + printf("%s: Running test #%d(iter %d): %s, %s mode.\n", + timestamp(), test, j, testlist[test%testcnt].name, + (test < testcnt) ? "shared" : "individual"); + fflush(stdout); + } + testlist[test%testcnt].function((test < testcnt) ? 
SHARED : !SHARED); + MPI_Barrier(MPI_COMM_WORLD); + } + } + } + } + + if (rank == 0) printf("%s: All tests passed!\n", timestamp()); + + /* unmount unifyfs */ + if (unifyfs) { + unifyfs_unmount(); + } + + MPI_Finalize(); + exit(0); +} diff --git a/examples/src/size.c b/examples/src/size.c index e57db50aa..5cd677442 100644 --- a/examples/src/size.c +++ b/examples/src/size.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2019, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2019, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -10,7 +10,9 @@ * This is the license for UnifyFS. * For details, see https://github.com/LLNL/UnifyFS. * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. - * + */ + +/* * Test file size functions * * Test description: @@ -19,6 +21,7 @@ * 3. Have the last rank laminate the file. * 4. Check the file size again. It should be the real, laminated, file size. */ + #include "testutil.h" int do_test(test_cfg* cfg) @@ -27,18 +30,19 @@ int do_test(test_cfg* cfg) int rank = cfg->rank; file = mktemp_cmd(cfg, "/unifyfs"); - assert(file); - - test_print(cfg, "I'm writing 1KB to %s at my offset at %ld", - file, rank * 1024); + if (NULL == file) { + return ENOMEM; + } + test_print(cfg, "I'm writing 1 KiB to %s at my offset at %ld", + file, rank * 1024); dd_cmd(cfg, "/dev/zero", file, 1024, 1, rank); test_print(cfg, "Stating the file"); stat_cmd(cfg, file); + test_print(cfg, "After writing, file size is %lu, apparent-size %lu", - du_cmd(cfg, file, 0), - du_cmd(cfg, file, 1)); + du_cmd(cfg, file, 0), du_cmd(cfg, file, 1)); /* sync our extents */ sync_cmd(cfg, file); @@ -51,9 +55,9 @@ int do_test(test_cfg* cfg) /* laminate by removing write bits */ chmod(file, 0444); /* set to read-only */ - test_print(cfg, "After lamination, file size is %lu", - du_cmd(cfg, file, 0)); + du_cmd(cfg, file, 0)); + test_print(cfg, "Stating the file"); stat_cmd(cfg, file); } @@ -72,9 +76,14 @@ int main(int argc, char* argv[]) fflush(NULL); return rc; } - do_test(cfg); + + rc = do_test(cfg); + if (rc) { + test_print(cfg, "ERROR - Test %s failed! rc=%d", argv[0], rc); + fflush(NULL); + } test_fini(cfg); - return 0; + return rc; } diff --git a/examples/src/sysio-cp.c b/examples/src/sysio-cp.c index 62693fed1..0c04bf1ec 100644 --- a/examples/src/sysio-cp.c +++ b/examples/src/sysio-cp.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -11,6 +11,7 @@ * For details, see https://github.com/LLNL/UnifyFS. * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. */ + #include #include diff --git a/examples/src/sysio-dir.c b/examples/src/sysio-dir.c index 868870821..5b37bf26d 100644 --- a/examples/src/sysio-dir.c +++ b/examples/src/sysio-dir.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. 
@@ -11,6 +11,7 @@ * For details, see https://github.com/LLNL/UnifyFS. * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. */ + #include #include diff --git a/examples/src/sysio-open.c b/examples/src/sysio-open.c new file mode 100644 index 000000000..f23d3da54 --- /dev/null +++ b/examples/src/sysio-open.c @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "testlib.h" + +static int fd; /* target file descriptor */ +static int standard; /* not mounting unifyfs when set */ + +static int rank; +static int total_ranks; + +static int create_rank; +static int open_rank; +static int debug; /* pause for attaching debugger */ +static int unmount; /* unmount unifyfs after running the test */ +static char* mountpoint = "/unifyfs"; /* unifyfs mountpoint */ +static char* filename = "testfile"; /* testfile name under mountpoint */ +static char targetfile[NAME_MAX]; /* target file name */ + +static struct option long_opts[] = { + { "create", 1, 0, 'c' }, + { "debug", 0, 0, 'd' }, + { "filename", 1, 0, 'f' }, + { "help", 0, 0, 'h' }, + { "mount", 1, 0, 'm' }, + { "open", 1, 0, 'o' }, + { "standard", 0, 0, 's' }, + { "unmount", 0, 0, 'u' }, + { 0, 0, 0, 0}, +}; + +static char* short_opts = "c:df:hm:o:su"; + +static const char* usage_str = + "\n" + "Usage: %s [options...]\n" + "\n" + "Available options:\n" + " -c, --create= create the file from \n" + " (default: 0)\n" + " -d, --debug pause before running test\n" + " (handy for attaching in debugger)\n" + " -f, --filename= target file name under mountpoint\n" + " (default: testfile)\n" + " -h, --help help message\n" + " -m, --mount= use for unifyfs\n" + " (default: /unifyfs)\n" + " -o, --open= open file from after create\n" + " (default: 0)\n" + " -s, --standard do not use unifyfs but run standard I/O\n" + " -u, --unmount unmount the filesystem after test\n" + "\n"; + +static char* program; + +static void print_usage(void) +{ + test_print_once(rank, usage_str, program); + exit(0); +} + +int main(int argc, char** argv) +{ + int ret = 0; + int ch = 0; + int optidx = 2; + + program = basename(strdup(argv[0])); + + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &total_ranks); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + while ((ch = getopt_long(argc, argv, + short_opts, long_opts, &optidx)) >= 0) { + switch (ch) { + case 'c': + create_rank = atoi(optarg); + break; + + case 'f': + filename = strdup(optarg); + break; + + case 'd': + debug = 1; + break; + + case 'm': + mountpoint = strdup(optarg); + break; + + case 'o': + open_rank = atoi(optarg); + break; + + case 's': + standard = 1; + break; + + case 'u': + unmount = 1; + break; + + case 'h': + default: + print_usage(); + break; + } + } + + if (static_linked(program) && standard) { + test_print_once(rank, "--standard, -s option only works when " + "dynamically linked."); + exit(-1); + } + + sprintf(targetfile, "%s/%s", mountpoint, filename); + + if (debug) { + test_pause(rank, "Attempting to mount"); + } + + if (!standard) { + ret = 
unifyfs_mount(mountpoint, rank, total_ranks, 0); + if (ret) { + test_print(rank, "unifyfs_mount failed (return = %d)", ret); + exit(-1); + } + } + + if ((create_rank < 0 || create_rank > total_ranks - 1) || + (open_rank < 0 || open_rank > total_ranks - 1)) { + test_print(rank, "please specify valid rank\n"); + exit(-1); + } + + MPI_Barrier(MPI_COMM_WORLD); + + if (rank == open_rank) { + fd = open(targetfile, O_CREAT|O_RDWR|O_TRUNC, 0600); + if (fd < 0) { + test_print(rank, "open failed (%d: %s)\n", + errno, strerror(errno)); + exit(-1); + } + + test_print(rank, "created file %s successfully\n", targetfile); + + close(fd); + } + + MPI_Barrier(MPI_COMM_WORLD); + + if (rank == open_rank) { + fd = open(targetfile, O_RDWR); + if (fd < 0) { + test_print(rank, "open failed (%d: %s)\n", + errno, strerror(errno)); + exit(-1); + } + + test_print(rank, "opened file %s successfully\n", targetfile); + + close(fd); + } + + if (!standard && unmount) { + unifyfs_unmount(); + } + + MPI_Finalize(); + + return ret; +} + diff --git a/examples/src/sysio-read.c b/examples/src/sysio-read.c index 1cacbf6f3..fedbc431a 100644 --- a/examples/src/sysio-read.c +++ b/examples/src/sysio-read.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -11,6 +11,7 @@ * For details, see https://github.com/LLNL/UnifyFS. * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. */ + #include #include diff --git a/examples/src/sysio-stat.c b/examples/src/sysio-stat.c index 7f0159c6f..14eee6055 100644 --- a/examples/src/sysio-stat.c +++ b/examples/src/sysio-stat.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -11,6 +11,7 @@ * For details, see https://github.com/LLNL/UnifyFS. * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
*/ + #include #include @@ -40,7 +41,8 @@ static int debug; static char* mountpoint = "/unifyfs"; /* unifyfs mountpoint */ static char* filename = "/unifyfs"; -static int unmount; /* unmount unifyfs after running the test */ +static int unmount; /* unmount unifyfs after running the test */ +static int testrank = -1; /* if negative, execute from all ranks */ #define FP_SPECIAL 1 @@ -108,15 +110,30 @@ static void dump_stat(int rank, const struct stat* sb) printf("Last status change: %s\n\n", ctime(&sb->st_ctime)); } +static void do_stat(int rank) +{ + int ret = 0; + struct stat sb; + + ret = stat(filename, &sb); + if (ret < 0) { + test_print(rank, "stat failed on \"%s\" (%d:%s)", + filename, errno, strerror(errno)); + } else { + dump_stat(rank, &sb); + } +} + static struct option const long_opts[] = { { "debug", 0, 0, 'd' }, { "help", 0, 0, 'h' }, { "mount", 1, 0, 'm' }, { "unmount", 0, 0, 'u' }, + { "rank", 1, 0, 'r' }, { 0, 0, 0, 0}, }; -static char* short_opts = "dhm:u"; +static char* short_opts = "dhm:ur:"; static const char* usage_str = "\n" @@ -129,6 +146,7 @@ static const char* usage_str = " -m, --mount= use for unifyfs\n" " (default: /unifyfs)\n" " -u, --unmount unmount the filesystem after test\n" + " -r, --rank= only test on rank \n" "\n"; static char* program; @@ -167,6 +185,10 @@ int main(int argc, char** argv) unmount = 1; break; + case 'r': + testrank = atoi(optarg); + break; + case 'h': default: print_usage(); @@ -178,6 +200,11 @@ int main(int argc, char** argv) print_usage(); } + if (testrank > total_ranks - 1) { + test_print(0, "Please specify a valid rank number."); + print_usage(); + } + filename = argv[optind]; if (debug) { @@ -192,11 +219,21 @@ int main(int argc, char** argv) MPI_Barrier(MPI_COMM_WORLD); - ret = stat(filename, &sb); - if (ret < 0) { - test_print(rank, "stat failed on \"%s\"", filename); + if (testrank < 0) { /* execute from all ranks in order */ + int i = 0; + + for (i = 0; i < total_ranks; i++) { + if (rank == i) { + do_stat(rank); + } + + MPI_Barrier(MPI_COMM_WORLD); + } + } else { - dump_stat(rank, &sb); + if (rank == testrank) { + do_stat(rank); + } } MPI_Barrier(MPI_COMM_WORLD); diff --git a/examples/src/sysio-truncate.c b/examples/src/sysio-truncate.c new file mode 100644 index 000000000..782fc4d3d --- /dev/null +++ b/examples/src/sysio-truncate.c @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "testlib.h" + +static int rank; +static int total_ranks; +static int debug; + +static char* mountpoint = "/unifyfs"; /* unifyfs mountpoint */ +static char* filename = "/unifyfs"; +static int unmount; /* unmount unifyfs after running the test */ +static int testrank; +static off_t targetlen; + +#define FP_SPECIAL 1 + +static void dump_stat(int rank, const struct stat* sb) +{ + printf("## [RANK %d] %s\n", rank, filename); + printf("File type: "); + + switch (sb->st_mode & S_IFMT) { + case S_IFREG: + printf("regular file\n"); + break; + case S_IFDIR: + printf("directory\n"); + break; + case S_IFCHR: + printf("character device\n"); + break; + case S_IFBLK: + printf("block device\n"); + break; + case S_IFLNK: + printf("symbolic (soft) link\n"); + break; + case S_IFIFO: + printf("FIFO or pipe\n"); + break; + case S_IFSOCK: + printf("socket\n"); + break; + default: + printf("unknown file type?\n"); + break; + } + + printf("Device containing i-node: major=%ld minor=%ld\n", + (long) major(sb->st_dev), (long) minor(sb->st_dev)); + + printf("I-node number: %ld\n", (long) sb->st_ino); + + printf("Mode: %lo\n", + (unsigned long) sb->st_mode); + + if (sb->st_mode & (S_ISUID | S_ISGID | S_ISVTX)) { + printf(" special bits set: %s%s%s\n", + (sb->st_mode & S_ISUID) ? "set-UID " : "", + (sb->st_mode & S_ISGID) ? "set-GID " : "", + (sb->st_mode & S_ISVTX) ? "sticky " : ""); + } + + printf("Number of (hard) links: %ld\n", (long) sb->st_nlink); + + printf("Ownership: UID=%ld GID=%ld\n", + (long) sb->st_uid, (long) sb->st_gid); + + if (S_ISCHR(sb->st_mode) || S_ISBLK(sb->st_mode)) { + printf("Device number (st_rdev): major=%ld; minor=%ld\n", + (long) major(sb->st_rdev), (long) minor(sb->st_rdev)); + } + + printf("File size: %lld bytes\n", (long long) sb->st_size); + printf("Optimal I/O block size: %ld bytes\n", (long) sb->st_blksize); + printf("512B blocks allocated: %lld\n", (long long) sb->st_blocks); + + printf("Last file access: %s", ctime(&sb->st_atime)); + printf("Last file modification: %s", ctime(&sb->st_mtime)); + printf("Last status change: %s\n\n", ctime(&sb->st_ctime)); +} + +static struct option long_opts[] = { + { "debug", 0, 0, 'd' }, + { "help", 0, 0, 'h' }, + { "length", 1, 0, 'l' }, + { "mount", 1, 0, 'm' }, + { "unmount", 0, 0, 'u' }, + { "rank", 1, 0, 'r' }, + { 0, 0, 0, 0}, +}; + +static char* short_opts = "dhl:m:ur:"; + +static const char* usage_str = + "\n" + "Usage: %s [options...] 
\n" + "\n" + "Available options:\n" + " -d, --debug pause before running test\n" + " (handy for attaching in debugger)\n" + " -h, --help help message\n" + " -l, --length= truncate the file to \n" + " -m, --mount= use for unifyfs\n" + " (default: /unifyfs)\n" + " -u, --unmount unmount the filesystem after test\n" + " -r, --rank= test on rank (default: 0)\n" + "\n"; + +static char* program; + +static void print_usage(void) +{ + test_print_once(rank, usage_str, program); + exit(0); +} + +int main(int argc, char** argv) +{ + int ret = 0; + int ch = 0; + int optidx = 0; + struct stat sb = { 0, }; + + program = basename(strdup(argv[0])); + + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &total_ranks); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + while ((ch = getopt_long(argc, argv, + short_opts, long_opts, &optidx)) >= 0) { + switch (ch) { + case 'd': + debug = 1; + break; + + case 'l': + targetlen = strtoull(optarg, NULL, 0); + break; + + case 'm': + mountpoint = strdup(optarg); + break; + + case 'u': + unmount = 1; + break; + + case 'r': + testrank = atoi(optarg); + break; + + case 'h': + default: + print_usage(); + break; + } + } + + if (argc - optind != 1) { + print_usage(); + } + + if (testrank > total_ranks - 1) { + test_print(0, "Please specify a valid rank number."); + print_usage(); + } + + filename = argv[optind]; + + if (debug) { + test_pause(rank, "Attempting to mount"); + } + + ret = unifyfs_mount(mountpoint, rank, total_ranks, 0); + if (ret) { + test_print(rank, "unifyfs_mount failed (return = %d)", ret); + exit(-1); + } + + MPI_Barrier(MPI_COMM_WORLD); + + if (rank == testrank) { + /* try stat the file before truncate */ + ret = stat(filename, &sb); + if (ret < 0) { + test_print(rank, "stat failed on \"%s\"", filename); + } else { + test_print(rank, "## stat before truncate to %llu\n", + (unsigned long long) targetlen); + dump_stat(rank, &sb); + } + + ret = truncate(filename, targetlen); + if (ret < 0) { + test_print(rank, "truncate failed on \"%s\": (errno=%d: %s)", + filename, errno, strerror(errno)); + } + + /* try stat the file again after truncate */ + ret = stat(filename, &sb); + if (ret < 0) { + test_print(rank, "stat failed on \"%s\"", filename); + } else { + test_print(rank, "## stat after truncate to %llu\n", + (unsigned long long) targetlen); + dump_stat(rank, &sb); + } + } + + MPI_Barrier(MPI_COMM_WORLD); + + if (unmount) { + unifyfs_unmount(); + } + + MPI_Finalize(); + + return ret; +} + diff --git a/examples/src/sysio-unlink.c b/examples/src/sysio-unlink.c new file mode 100644 index 000000000..4de9a7d4d --- /dev/null +++ b/examples/src/sysio-unlink.c @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "testlib.h" + +static int rank; +static int total_ranks; +static int debug; + +static char* mountpoint = "/unifyfs"; /* unifyfs mountpoint */ +static char* filename = "/unifyfs"; +static int unmount; /* unmount unifyfs after running the test */ +static int testrank; + +static struct option long_opts[] = { + { "debug", 0, 0, 'd' }, + { "help", 0, 0, 'h' }, + { "mount", 1, 0, 'm' }, + { "unmount", 0, 0, 'u' }, + { "rank", 1, 0, 'r' }, + { 0, 0, 0, 0}, +}; + +static char* short_opts = "dhm:ur:"; + +static const char* usage_str = + "\n" + "Usage: %s [options...] \n" + "\n" + "Available options:\n" + " -d, --debug pause before running test\n" + " (handy for attaching in debugger)\n" + " -h, --help help message\n" + " -m, --mount= use for unifyfs\n" + " (default: /unifyfs)\n" + " -u, --unmount unmount the filesystem after test\n" + " -r, --rank= test on rank (default: 0)\n" + "\n"; + +static char* program; + +static void print_usage(void) +{ + test_print_once(rank, usage_str, program); + exit(0); +} + +int main(int argc, char** argv) +{ + int ret = 0; + int ch = 0; + int optidx = 0; + + program = basename(strdup(argv[0])); + + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &total_ranks); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + while ((ch = getopt_long(argc, argv, + short_opts, long_opts, &optidx)) >= 0) { + switch (ch) { + case 'd': + debug = 1; + break; + + case 'm': + mountpoint = strdup(optarg); + break; + + case 'u': + unmount = 1; + break; + + case 'r': + testrank = atoi(optarg); + break; + + case 'h': + default: + print_usage(); + break; + } + } + + if (argc - optind != 1) { + print_usage(); + } + + if (testrank > total_ranks - 1) { + test_print(0, "Please specify a valid rank number."); + print_usage(); + } + + filename = argv[optind]; + + if (debug) { + test_pause(rank, "Attempting to mount"); + } + + ret = unifyfs_mount(mountpoint, rank, total_ranks, 0); + if (ret) { + test_print(rank, "unifyfs_mount failed (return = %d)", ret); + exit(-1); + } + + MPI_Barrier(MPI_COMM_WORLD); + + if (rank == testrank) { + ret = unlink(filename); + if (ret < 0) { + test_print(rank, "unlink failed on \"%s\" (%s)", + filename, strerror(errno)); + } + } + +out: + MPI_Barrier(MPI_COMM_WORLD); + + if (unmount) { + unifyfs_unmount(); + } + + MPI_Finalize(); + + return ret; +} + diff --git a/examples/src/sysio-write.c b/examples/src/sysio-write.c index 2b5dd31b1..133e4a3eb 100644 --- a/examples/src/sysio-write.c +++ b/examples/src/sysio-write.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -11,6 +11,7 @@ * For details, see https://github.com/LLNL/UnifyFS. * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
*/ + #include #include @@ -331,10 +332,22 @@ int main(int argc, char** argv) sprintf(&targetfile[strlen(targetfile)], "-%d", rank); } - fd = open(targetfile, O_RDWR | O_CREAT | O_TRUNC, 0600); - if (fd < 0) { - test_print(rank, "open failed"); - exit(-1); + if (rank == 0) { + fd = open(targetfile, O_RDWR | O_CREAT | O_TRUNC, 0600); + if (fd < 0) { + test_print(rank, "open failed"); + exit(-1); + } + } + + MPI_Barrier(MPI_COMM_WORLD); + + if (rank != 0) { + fd = open(targetfile, O_RDWR, 0600); + if (fd < 0) { + test_print(rank, "open failed"); + exit(-1); + } } ret = do_write(); diff --git a/examples/src/sysio-writeread.c b/examples/src/sysio-writeread.c index 899cbab50..9285b0f02 100644 --- a/examples/src/sysio-writeread.c +++ b/examples/src/sysio-writeread.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -151,7 +151,7 @@ int main(int argc, char* argv[]) #ifndef DISABLE_UNIFYFS if (use_unifyfs) { ret = unifyfs_mount(mntpt, rank, num_rank, 0); - if (UNIFYFS_SUCCESS != ret) { + if (0 != ret) { MPI_Abort(MPI_COMM_WORLD, ret); } MPI_Barrier(MPI_COMM_WORLD); diff --git a/examples/src/sysio-writeread2.c b/examples/src/sysio-writeread2.c index cf03fce86..76cad03a5 100644 --- a/examples/src/sysio-writeread2.c +++ b/examples/src/sysio-writeread2.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. diff --git a/examples/src/testlib.h b/examples/src/testlib.h index 06c87c6ae..586cd6ab9 100644 --- a/examples/src/testlib.h +++ b/examples/src/testlib.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -11,6 +11,7 @@ * For details, see https://github.com/LLNL/UnifyFS. * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. */ + #ifndef __TESTLIB_H #define __TESTLIB_H diff --git a/examples/src/testutil.c b/examples/src/testutil.c index 50b21219b..bb3b40256 100644 --- a/examples/src/testutil.c +++ b/examples/src/testutil.c @@ -1,15 +1,19 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#include /* device major() and minor() macros */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include "testutil.h" /* Clone of the apsrintf(). 
See the standard asprintf() man page for details */ @@ -96,7 +100,9 @@ int dd_cmd(test_cfg* cfg, char* infile, char* outfile, unsigned long bs, return errno; } buf = calloc(sizeof(*buf), bs); - assert(buf); + if (NULL == buf) { + return errno; + } rc = fseek(outfp, seek * bs, SEEK_SET); if (rc) { @@ -270,13 +276,17 @@ int sync_cmd(test_cfg* cfg, char* filename) int stat_cmd(test_cfg* cfg, char* filename) { struct stat sb; + time_t timestamp; int rc; const char* typestr; char* tmp; + char* newline; + char datestr[32]; + errno = 0; rc = stat(filename, &sb); if (rc) { - test_print(cfg, "Error stating %s: %s", filename, strerror(rc)); + test_print(cfg, "ERROR: stat(%s) failed", filename); return rc; } @@ -312,10 +322,11 @@ int stat_cmd(test_cfg* cfg, char* filename) test_print(cfg, "%-26s%s", tmp, typestr); free(tmp); - test_print(cfg, "Device containing i-node: major=%ld minor=%ld", + test_print(cfg, "Device containing i-node: major=%ld minor=%ld", (long) major(sb.st_dev), (long) minor(sb.st_dev)); - test_print(cfg, "I-node number: %ld", (long) sb.st_ino); + test_print(cfg, "I-node number: %lu", + (unsigned long) sb.st_ino); test_print(cfg, "Mode: %lo", (unsigned long) sb.st_mode); @@ -327,7 +338,8 @@ int stat_cmd(test_cfg* cfg, char* filename) (sb.st_mode & S_ISVTX) ? "sticky " : ""); } - test_print(cfg, "Number of (hard) links: %ld", (long) sb.st_nlink); + test_print(cfg, "Number of (hard) links: %lu", + (unsigned long) sb.st_nlink); test_print(cfg, "Ownership: UID=%ld GID=%ld", (long) sb.st_uid, (long) sb.st_gid); @@ -337,12 +349,37 @@ int stat_cmd(test_cfg* cfg, char* filename) (long) major(sb.st_rdev), (long) minor(sb.st_rdev)); } - test_print(cfg, "File size: %lld bytes", - (long long) sb.st_size); - test_print(cfg, "Optimal I/O block size: %ld bytes", - (long) sb.st_blksize); - test_print(cfg, "Blocks allocated: %lld", (long long) sb.st_blocks); - test_print(cfg, "Last file access: %s", ctime(&sb.st_atime)); - test_print(cfg, "Last file modification: %s", ctime(&sb.st_mtime)); - test_print(cfg, "Last status change: %s", ctime(&sb.st_ctime)); + test_print(cfg, "File size: %llu bytes", + (unsigned long long) sb.st_size); + test_print(cfg, "Optimal I/O block size: %lu bytes", + (unsigned long) sb.st_blksize); + test_print(cfg, "Blocks allocated: %llu", + (unsigned long long) sb.st_blocks); + + memset(datestr, 0, sizeof(datestr)); + timestamp = sb.st_atime; + ctime_r(&timestamp, datestr); + newline = strchr(datestr, '\n'); + if (NULL != newline) { + *newline = '\0'; + } + test_print(cfg, "Last file access: %s", datestr); + + memset(datestr, 0, sizeof(datestr)); + timestamp = sb.st_mtime; + ctime_r(&timestamp, datestr); + newline = strchr(datestr, '\n'); + if (NULL != newline) { + *newline = '\0'; + } + test_print(cfg, "Last file modification: %s", datestr); + + memset(datestr, 0, sizeof(datestr)); + timestamp = sb.st_ctime; + ctime_r(&timestamp, datestr); + newline = strchr(datestr, '\n'); + if (NULL != newline) { + *newline = '\0'; + } + test_print(cfg, "Last status change: %s", datestr); } diff --git a/examples/src/testutil.h b/examples/src/testutil.h index 7ee02ad89..d64b0c124 100644 --- a/examples/src/testutil.h +++ b/examples/src/testutil.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2019, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2019, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. 
@@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -32,6 +33,7 @@ #include #include #include +#include #include #include @@ -118,14 +120,20 @@ typedef struct { int verbose; /* print verbose information to stderr */ int use_mpi; int use_unifyfs; + int enable_mpi_mount; /* automount during MPI_Init() */ + char* output_file; /* print test messages to output file */ + FILE* output_fp; /* I/O behavior options */ int io_pattern; /* N1 or NN */ int io_check; /* use lipsum to verify data */ int io_shuffle; /* read and write different extents */ + int pre_wr_trunc; /* truncate file before writing */ + int post_wr_trunc; /* truncate file after writing */ int use_aio; /* use asynchronous IO */ int use_lio; /* use lio_listio instead of read/write */ int use_mapio; /* use mmap instead of read/write */ + int use_mpiio; /* use MPI-IO instead of POSIX I/O */ int use_prdwr; /* use pread/pwrite instead of read/write */ int use_stdio; /* use fread/fwrite instead of read/write */ int use_vecio; /* use readv/writev instead of read/write */ @@ -134,21 +142,25 @@ typedef struct { uint64_t n_blocks; /* number of I/O blocks */ uint64_t block_sz; /* IO block size (multiple of chunk_sz) */ uint64_t chunk_sz; /* per-IO-op size */ + off_t trunc_size; /* file size for truncate */ /* target file info */ char* filename; char* mountpt; FILE* fp; + int fd; + int fd_access; /* access flags for cfg.fd */ void* mapped; /* address of mapped extent of cfg.fd */ off_t mapped_off; /* start offset for mapped extent */ size_t mapped_sz; /* size of mapped extent */ - int fd; - int fd_access; /* access flags for cfg.fd */ + MPI_File mpifh; /* MPI file handle (when use_mpiio) */ /* MPI info */ - int app_id; int rank; int n_ranks; + + /* UnifyFS info */ + int app_id; } test_cfg; static inline @@ -168,6 +180,10 @@ void test_config_init(test_cfg* cfg) cfg->use_unifyfs = 1; cfg->io_pattern = IO_PATTERN_N1; +#if defined(ENABLE_MPI_MOUNT) + cfg->enable_mpi_mount = 1; +#endif + // invalidate file descriptor cfg->fd = -1; @@ -182,39 +198,51 @@ void test_config_print(test_cfg* cfg) { assert(NULL != cfg); - fprintf(stderr, " Test Configuration\n"); - fprintf(stderr, "==========================\n"); - - fprintf(stderr, "\n-- Program Behavior --\n"); - fprintf(stderr, "\t debug = %d\n", cfg->debug); - fprintf(stderr, "\t verbose = %d\n", cfg->verbose); - fprintf(stderr, "\t use_mpi = %d\n", cfg->use_mpi); - fprintf(stderr, "\t use_unifyfs = %d\n", cfg->use_unifyfs); - - fprintf(stderr, "\n-- IO Behavior --\n"); - fprintf(stderr, "\t io_pattern = %s\n", io_pattern_str(cfg->io_pattern)); - fprintf(stderr, "\t io_check = %d\n", cfg->io_check); - fprintf(stderr, "\t io_shuffle = %d\n", cfg->io_shuffle); - fprintf(stderr, "\t use_aio = %d\n", cfg->use_aio); - fprintf(stderr, "\t use_lio = %d\n", cfg->use_lio); - fprintf(stderr, "\t use_mapio = %d\n", cfg->use_mapio); - fprintf(stderr, "\t use_prdwr = %d\n", cfg->use_prdwr); - fprintf(stderr, "\t use_stdio = %d\n", cfg->use_stdio); - fprintf(stderr, "\t use_vecio = %d\n", cfg->use_vecio); - - fprintf(stderr, "\n-- IO Size Config --\n"); - fprintf(stderr, "\t n_blocks = %" PRIu64 "\n", cfg->n_blocks); - fprintf(stderr, "\t block_sz = %" PRIu64 "\n", cfg->block_sz); - fprintf(stderr, "\t chunk_sz = %" PRIu64 "\n", cfg->chunk_sz); - - fprintf(stderr, "\n-- Target File --\n"); - fprintf(stderr, "\t filename = %s\n", cfg->filename); - fprintf(stderr, "\t mountpt = %s\n", cfg->mountpt); - - fprintf(stderr, "\n-- MPI Info --\n"); - fprintf(stderr, "\t app_id = %d\n", cfg->app_id); - 
fprintf(stderr, "\t rank = %d\n", cfg->rank); - fprintf(stderr, "\t n_ranks = %d\n", cfg->n_ranks); + FILE* fp = cfg->output_fp; + if (NULL == fp) { + fp = stderr; + } + + fprintf(fp, " Test Configuration\n"); + fprintf(fp, "==========================\n"); + + fprintf(fp, "\n-- Program Behavior --\n"); + fprintf(fp, "\t debug = %d\n", cfg->debug); + fprintf(fp, "\t verbose = %d\n", cfg->verbose); + fprintf(fp, "\t use_mpi = %d\n", cfg->use_mpi); + fprintf(fp, "\t use_unifyfs = %d\n", cfg->use_unifyfs); + fprintf(fp, "\t mpi_mount = %d\n", cfg->enable_mpi_mount); + fprintf(fp, "\t outfile = %s\n", cfg->output_file); + + fprintf(fp, "\n-- IO Behavior --\n"); + fprintf(fp, "\t io_pattern = %s\n", io_pattern_str(cfg->io_pattern)); + fprintf(fp, "\t io_check = %d\n", cfg->io_check); + fprintf(fp, "\t io_shuffle = %d\n", cfg->io_shuffle); + fprintf(fp, "\t pre_trunc = %d\n", cfg->pre_wr_trunc); + fprintf(fp, "\t post_trunc = %d\n", cfg->post_wr_trunc); + fprintf(fp, "\t use_aio = %d\n", cfg->use_aio); + fprintf(fp, "\t use_lio = %d\n", cfg->use_lio); + fprintf(fp, "\t use_mapio = %d\n", cfg->use_mapio); + fprintf(fp, "\t use_mpiio = %d\n", cfg->use_mpiio); + fprintf(fp, "\t use_prdwr = %d\n", cfg->use_prdwr); + fprintf(fp, "\t use_stdio = %d\n", cfg->use_stdio); + fprintf(fp, "\t use_vecio = %d\n", cfg->use_vecio); + + fprintf(fp, "\n-- IO Size Config --\n"); + fprintf(fp, "\t n_blocks = %" PRIu64 "\n", cfg->n_blocks); + fprintf(fp, "\t block_sz = %" PRIu64 "\n", cfg->block_sz); + fprintf(fp, "\t chunk_sz = %" PRIu64 "\n", cfg->chunk_sz); + fprintf(fp, "\t truncate_sz = %lu\n", (unsigned long)cfg->trunc_size); + + fprintf(fp, "\n-- Target File --\n"); + fprintf(fp, "\t filename = %s\n", cfg->filename); + fprintf(fp, "\t mountpt = %s\n", cfg->mountpt); + + fprintf(fp, "\n-- MPI Info --\n"); + fprintf(fp, "\t app_id = %d\n", cfg->app_id); + fprintf(fp, "\t rank = %d\n", cfg->rank); + fprintf(fp, "\t n_ranks = %d\n", cfg->n_ranks); + fprintf(fp, "\n==========================\n\n"); } static inline @@ -240,28 +268,28 @@ static inline void test_print(test_cfg* cfg, const char* fmt, ...) { int err = errno; - char buf[1024]; assert(NULL != cfg); - printf("[%d] ", cfg->rank); + FILE* fp = cfg->output_fp; + if (NULL == fp) { + fp = stdout; + } + + fprintf(fp, "[%d] ", cfg->rank); va_list args; va_start(args, fmt); - vsprintf(buf, fmt, args); - printf("%s", buf); + vfprintf(fp, fmt, args); va_end(args); if (err) { - printf(" (errno=%d, %s)", err, strerror(err)); - } - - /* Add in a '\n' if the line didn't end with one */ - if (buf[strlen(buf) - 1] != '\n') { - printf("\n"); + fprintf(fp, " (errno=%d, %s)", err, strerror(err)); } - fflush(stdout); + /* End with a newline */ + fprintf(fp, "\n"); + fflush(fp); } static inline @@ -275,17 +303,23 @@ void test_print_once(test_cfg* cfg, const char* fmt, ...) return; } + FILE* fp = cfg->output_fp; + if (NULL == fp) { + fp = stdout; + } + va_list args; va_start(args, fmt); - vfprintf(stdout, fmt, args); + vfprintf(fp, fmt, args); va_end(args); if (err) { - printf(" (errno=%d, %s)\n", err, strerror(err)); + fprintf(fp, " (errno=%d, %s)\n", err, strerror(err)); } - printf("\n"); - fflush(stdout); + /* End with a newline */ + fprintf(fp, "\n"); + fflush(fp); } static inline @@ -297,13 +331,18 @@ void test_print_verbose(test_cfg* cfg, const char* fmt, ...) 
return; } + FILE* fp = cfg->output_fp; + if (NULL == fp) { + fp = stderr; + } + va_list args; va_start(args, fmt); - vfprintf(stderr, fmt, args); + vfprintf(fp, fmt, args); va_end(args); - fprintf(stderr, "\n"); - fflush(stderr); + fprintf(fp, "\n"); + fflush(fp); } static inline @@ -315,13 +354,18 @@ void test_print_verbose_once(test_cfg* cfg, const char* fmt, ...) return; } + FILE* fp = cfg->output_fp; + if (NULL == fp) { + fp = stderr; + } + va_list args; va_start(args, fmt); - vfprintf(stderr, fmt, args); + vfprintf(fp, fmt, args); va_end(args); - fprintf(stderr, "\n"); - fflush(stderr); + fprintf(fp, "\n"); + fflush(fp); } /* ---------- Timer Utilities ---------- */ @@ -329,8 +373,10 @@ void test_print_verbose_once(test_cfg* cfg, const char* fmt, ...) typedef struct { struct timeval start; struct timeval stop; + struct timeval stop_all; char* name; double elapsed_sec; + double elapsed_sec_all; } test_timer; static inline @@ -384,6 +430,39 @@ void timer_stop(test_timer* timer) &(timer->stop)); } +static inline +void timer_start_barrier(test_cfg* cfg, test_timer* timer) +{ + /* execute a barrier to ensure procs don't start + * next phase before all have reached this point */ + if (cfg->use_mpi) { + MPI_Barrier(MPI_COMM_WORLD); + } + + /* everyone has reached, start the timer, + * the start field is used in both local and global timers */ + timer_start(timer); +} + +static inline +void timer_stop_barrier(test_cfg* cfg, test_timer* timer) +{ + /* stop the local timer and compute elapsed_secs */ + timer_stop(timer); + + /* execute a barrier to ensure procs have reached this point + * before stopping the global timer */ + if (cfg->use_mpi) { + MPI_Barrier(MPI_COMM_WORLD); + } + + /* everyone has reached, stop the global timer and + * compute elapsed global time */ + gettimeofday(&(timer->stop_all), NULL); + timer->elapsed_sec_all = timediff_sec(&(timer->start), + &(timer->stop_all)); +} + /* ---------- Option Parsing Utilities ---------- */ static const char* unifyfs_mntpt = "/unifyfs"; @@ -412,7 +491,7 @@ int test_is_static(const char* program) // common options for all tests -static const char* test_short_opts = "a:Ab:c:df:hkLm:Mn:p:PSUvVx"; +static const char* test_short_opts = "a:Ab:c:df:hkLm:MNn:o:p:PSt:T:UvVx"; static const struct option test_long_opts[] = { { "appid", 1, 0, 'a' }, @@ -425,10 +504,14 @@ static const struct option test_long_opts[] = { { "check", 0, 0, 'k' }, { "listio", 0, 0, 'L' }, { "mount", 1, 0, 'm' }, - { "mapio", 0, 0, 'M' }, + { "mpiio", 0, 0, 'M' }, { "nblocks", 1, 0, 'n' }, + { "mapio", 0, 0, 'N' }, + { "outfile", 1, 0, 'o' }, { "pattern", 1, 0, 'p' }, { "prdwr", 0, 0, 'P' }, + { "pre-truncate", 1, 0, 't' }, + { "post-truncate", 1, 0, 'T' }, { "stdio", 0, 0, 'S' }, { "disable-unifyfs", 0, 0, 'U' }, { "verbose", 0, 0, 'v' }, @@ -460,16 +543,24 @@ static const char* test_usage_str = " (default: off)\n" " -m, --mount= use for unifyfs\n" " (default: /unifyfs)\n" - " -M, --mapio use mmap instead of read|write\n" + " -M, --mpiio use MPI-IO instead of POSIX I/O\n" " (default: off)\n" " -n, --nblocks= count of blocks each process will read|write\n" " (default: 32)\n" + " -N, --mapio use mmap instead of read|write\n" + " (default: off)\n" + " -o, --outfile= output file name (or path)\n" + " (default: 'stdout')\n" " -p, --pattern= 'n1' (N-to-1 shared file) or 'nn' (N-to-N file per process)\n" " (default: 'n1')\n" " -P, --prdwr use pread|pwrite instead of read|write\n" " (default: off)\n" " -S, --stdio use fread|fwrite instead of read|write\n" " (default: off)\n" + " -t, 
--pre-truncate= truncate file to size (B) before writing\n" + " (default: off)\n" + " -T, --post-truncate= truncate file to size (B) after writing\n" + " (default: off)\n" " -U, --disable-unifyfs do not use UnifyFS\n" " (default: enable UnifyFS)\n" " -v, --verbose print verbose information\n" @@ -538,13 +629,21 @@ int test_process_argv(test_cfg* cfg, break; case 'M': - cfg->use_mapio = 1; + cfg->use_mpiio = 1; break; case 'n': cfg->n_blocks = (uint64_t) strtoul(optarg, NULL, 0); break; + case 'N': + cfg->use_mapio = 1; + break; + + case 'o': + cfg->output_file = strdup(optarg); + break; + case 'p': cfg->io_pattern = check_io_pattern(optarg); break; @@ -557,6 +656,16 @@ int test_process_argv(test_cfg* cfg, cfg->use_stdio = 1; break; + case 't': + cfg->pre_wr_trunc = 1; + cfg->trunc_size = (off_t) strtoul(optarg, NULL, 0); + break; + + case 'T': + cfg->post_wr_trunc = 1; + cfg->trunc_size = (off_t) strtoul(optarg, NULL, 0); + break; + case 'U': cfg->use_unifyfs = 0; break; @@ -616,41 +725,63 @@ int test_process_argv(test_cfg* cfg, } // exhaustive check of incompatible I/O modes + if (cfg->pre_wr_trunc || cfg->post_wr_trunc) { + if (cfg->pre_wr_trunc && cfg->post_wr_trunc) { + test_print_once(cfg, + "USAGE ERROR: choose --pre-truncate or --post-truncate"); + exit(-1); + } + if (cfg->use_mapio || cfg->use_stdio) { + test_print_once(cfg, + "USAGE ERROR: pre/post-truncate incompatible with " + "[--mapio, --stdio]"); + exit(-1); + } + } if (cfg->use_aio && - (cfg->use_mapio || cfg->use_prdwr || cfg->use_stdio || cfg->use_vecio)) { - test_print_once(cfg, "USAGE ERROR: --aio incompatible with " - "[--mapio, --prdwr, --stdio, --vecio]"); + (cfg->use_mapio || cfg->use_mpiio || cfg->use_prdwr + || cfg->use_stdio || cfg->use_vecio)) { + test_print_once(cfg, + "USAGE ERROR: --aio incompatible with " + "[--mapio, --mpiio, --prdwr, --stdio, --vecio]"); exit(-1); } - if (cfg->use_lio && - (cfg->use_mapio || cfg->use_prdwr || cfg->use_stdio || cfg->use_vecio)) { - test_print_once(cfg, "USAGE ERROR: --listio incompatible with " - "[--mapio, --prdwr, --stdio, --vecio]"); + (cfg->use_mapio || cfg->use_mpiio || cfg->use_prdwr + || cfg->use_stdio || cfg->use_vecio)) { + test_print_once(cfg, + "USAGE ERROR: --listio incompatible with " + "[--mapio, --mpiio, --prdwr, --stdio, --vecio]"); exit(-1); } - if (cfg->use_mapio && - (cfg->use_prdwr || cfg->use_stdio || cfg->use_vecio)) { - test_print_once(cfg, "USAGE ERROR: --mapio incompatible with " - "[--aio, --listio, --prdwr, --stdio, --vecio]"); + (cfg->use_mpiio || cfg->use_prdwr || cfg->use_stdio + || cfg->use_vecio)) { + test_print_once(cfg, + "USAGE ERROR: --mapio incompatible with " + "[--aio, --listio, --mpiio, --prdwr, --stdio, --vecio]"); exit(-1); } - - if (cfg->use_prdwr && - (cfg->use_stdio || cfg->use_vecio)) { - test_print_once(cfg, "USAGE ERROR: --prdwr incompatible with " - "[--aio, --listio, --mapio, --stdio, --vecio]"); + if (cfg->use_mpiio && + (cfg->use_prdwr || cfg->use_stdio || cfg->use_vecio)) { + test_print_once(cfg, + "USAGE ERROR: --mpiio incompatible with " + "[--aio, --listio, --mpiio, --prdwr, --stdio, --vecio]"); + exit(-1); + } + if (cfg->use_prdwr && (cfg->use_stdio || cfg->use_vecio)) { + test_print_once(cfg, + "USAGE ERROR: --prdwr incompatible with " + "[--aio, --listio, --mapio, --mpiio, --stdio, --vecio]"); exit(-1); } - if (cfg->use_stdio && cfg->use_vecio) { - test_print_once(cfg, "USAGE ERROR: --stdio incompatible with " - "[--aio, --listio, --mapio, --prdwr, --vecio]"); + test_print_once(cfg, + "USAGE ERROR: --stdio 
incompatible with " + "[--aio, --listio, --mapio, --mpiio, --prdwr, --vecio]"); exit(-1); } - if (NULL == cfg->filename) { // set filename default cfg->filename = strdup("testfile"); @@ -697,13 +828,29 @@ int lipsum_check(const char* buf, uint64_t len, uint64_t offset, uint64_t i, val; uint64_t start = offset / sizeof(uint64_t); uint64_t count = len / sizeof(uint64_t); + uint64_t skip = 0; + uint64_t remain = 0; const uint64_t* ibuf = (uint64_t*) buf; + /* check if we have any extra bytes at the front and end */ + if (offset % sizeof(uint64_t)) { + skip = sizeof(uint64_t) - (offset % sizeof(uint64_t)); + remain = (len - skip) % sizeof(uint64_t); + + ibuf = (uint64_t*) &buf[skip]; + start++; + + if (skip + remain >= sizeof(uint64_t)) { + count--; + } + } + for (i = 0; i < count; i++) { val = start + i; if (ibuf[i] != val) { *error_offset = offset + (i * sizeof(uint64_t)); - fprintf(stderr, "DEBUG: [%" PRIu64 "] @ offset %" PRIu64 + fprintf(stderr, + "LIPSUM CHECK ERROR: [%" PRIu64 "] @ offset %" PRIu64 ", expected=%" PRIu64 " found=%" PRIu64 "\n", i, *error_offset, val, ibuf[i]); return -1; @@ -715,13 +862,37 @@ int lipsum_check(const char* buf, uint64_t len, uint64_t offset, /* ---------- MPI Utilities ---------- */ +/* MPI checker + * from: https://stackoverflow.com/questions/22859269/ + */ +#define MPI_CHECK(cfgp, fncall) \ + do { \ + mpi_error = 0; \ + int _merr = fncall; \ + if (_merr != MPI_SUCCESS) { \ + mpi_error = _merr; \ + handle_mpi_error(cfgp, #fncall); \ + } \ + } while (0) + +static int mpi_error; + +static inline +void handle_mpi_error(test_cfg* cfg, char* context) +{ + char errstr[MPI_MAX_ERROR_STRING]; + int len = 0; + MPI_Error_string(mpi_error, errstr, &len); + test_print(cfg, "MPI ERROR: %s returned %s", context, errstr); +} + static inline void test_barrier(test_cfg* cfg) { assert(NULL != cfg); if (cfg->use_mpi) { - MPI_Barrier(MPI_COMM_WORLD); + MPI_CHECK(cfg, (MPI_Barrier(MPI_COMM_WORLD))); } } @@ -733,8 +904,9 @@ double test_reduce_double_sum(test_cfg* cfg, double local_val) assert(NULL != cfg); if (cfg->use_mpi) { - MPI_Reduce(&local_val, &aggr_val, - 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + MPI_CHECK(cfg, (MPI_Reduce(&local_val, &aggr_val, + 1, MPI_DOUBLE, MPI_SUM, + 0, MPI_COMM_WORLD))); } else { aggr_val = local_val; } @@ -749,8 +921,9 @@ double test_reduce_double_max(test_cfg* cfg, double local_val) assert(NULL != cfg); if (cfg->use_mpi) { - MPI_Reduce(&local_val, &aggr_val, - 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + MPI_CHECK(cfg, (MPI_Reduce(&local_val, &aggr_val, + 1, MPI_DOUBLE, MPI_MAX, + 0, MPI_COMM_WORLD))); } else { aggr_val = local_val; } @@ -765,8 +938,9 @@ double test_reduce_double_min(test_cfg* cfg, double local_val) assert(NULL != cfg); if (cfg->use_mpi) { - MPI_Reduce(&local_val, &aggr_val, - 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD); + MPI_CHECK(cfg, (MPI_Reduce(&local_val, &aggr_val, + 1, MPI_DOUBLE, MPI_MIN, + 0, MPI_COMM_WORLD))); } else { aggr_val = local_val; } @@ -806,6 +980,21 @@ const char* test_access_to_stdio_mode(int access) return NULL; } +static int test_access_to_mpiio_mode(int access) +{ + switch (access) { + case O_RDWR: + return MPI_MODE_RDWR; + case O_RDONLY: + return MPI_MODE_RDONLY; + case O_WRONLY: + return MPI_MODE_WRONLY; + default: + break; + } + return 0; +} + /* * open the given file */ @@ -818,7 +1007,19 @@ int test_open_file(test_cfg* cfg, const char* filepath, int access) assert(NULL != cfg); - if (cfg->use_stdio) { + if (cfg->use_mpiio) { + int amode = test_access_to_mpiio_mode(access); + if 
(cfg->io_pattern == IO_PATTERN_N1) { + MPI_CHECK(cfg, (MPI_File_open(MPI_COMM_WORLD, filepath, amode, + MPI_INFO_NULL, &cfg->mpifh))); + } else { + MPI_CHECK(cfg, (MPI_File_open(MPI_COMM_SELF, filepath, amode, + MPI_INFO_NULL, &cfg->mpifh))); + } + if (mpi_error) { + return -1; + } + } else if (cfg->use_stdio) { fmode = test_access_to_stdio_mode(access); fp = fopen(filepath, fmode); if (NULL == fp) { @@ -826,16 +1027,15 @@ int test_open_file(test_cfg* cfg, const char* filepath, int access) return -1; } cfg->fp = fp; - return 0; - } - - fd = open(filepath, access); - if (-1 == fd) { - test_print(cfg, "ERROR: open(%s) failed", filepath); - return -1; + } else { + fd = open(filepath, access); + if (-1 == fd) { + test_print(cfg, "ERROR: open(%s) failed", filepath); + return -1; + } + cfg->fd = fd; + cfg->fd_access = access; } - cfg->fd = fd; - cfg->fd_access = access; return 0; } @@ -847,6 +1047,10 @@ int test_close_file(test_cfg* cfg) { assert(NULL != cfg); + if (cfg->use_mpiio) { + MPI_CHECK(cfg, (MPI_File_close(&cfg->mpifh))); + } + if (NULL != cfg->fp) { fclose(cfg->fp); } @@ -875,13 +1079,29 @@ int test_create_file(test_cfg* cfg, const char* filepath, int access) assert(NULL != cfg); - if (cfg->use_stdio) { - fmode = test_access_to_stdio_mode(access); + if (cfg->use_mpiio) { + create_mode = test_access_to_mpiio_mode(access); + create_mode |= MPI_MODE_CREATE; + if (cfg->io_pattern == IO_PATTERN_N1) { + MPI_CHECK(cfg, (MPI_File_open(MPI_COMM_WORLD, filepath, create_mode, + MPI_INFO_NULL, &cfg->mpifh))); + } else { + create_mode |= MPI_MODE_EXCL; + MPI_CHECK(cfg, (MPI_File_open(MPI_COMM_SELF, filepath, create_mode, + MPI_INFO_NULL, &cfg->mpifh))); + } + if (mpi_error) { + return -1; + } + return 0; } - // rank 0 creates or all ranks create if using file-per-process + /* POSIX I/O + * N-to-1 - rank 0 creates shared files + * N-to-N - all ranks create per-process files */ if (cfg->rank == 0 || cfg->io_pattern == IO_PATTERN_NN) { if (cfg->use_stdio) { + fmode = test_access_to_stdio_mode(access); fp = fopen(filepath, fmode); if (NULL == fp) { test_print(cfg, "ERROR: fopen(%s) failed", filepath); @@ -1001,19 +1221,32 @@ int test_init(int argc, char** argv, } if (cfg->use_mpi) { - MPI_Init(&argc, &argv); + MPI_CHECK(cfg, (MPI_Init(&argc, &argv))); MPI_Comm_size(MPI_COMM_WORLD, &(cfg->n_ranks)); MPI_Comm_rank(MPI_COMM_WORLD, &(cfg->rank)); } else { cfg->n_ranks = 1; } + if (NULL != cfg->output_file) { + if (cfg->rank == 0) { + // only rank 0 writes to output file + cfg->output_fp = fopen(cfg->output_file, "a"); + if (NULL == cfg->output_fp) { + test_print_once(cfg, + "USAGE ERROR: failed to open output file %s", + cfg->output_file); + exit(-1); + } + } + } + if (cfg->verbose) { - // must come after test_mpi_init() to pick up MPI info + // must come after MPI_Init() to have valid MPI info test_config_print(cfg); } - if (cfg->use_unifyfs) { + if (cfg->use_unifyfs && !cfg->enable_mpi_mount) { #ifndef DISABLE_UNIFYFS if (cfg->debug) { test_pause(cfg, "Before unifyfs_mount()"); @@ -1022,13 +1255,14 @@ int test_init(int argc, char** argv, if (rc) { test_print(cfg, "ERROR: unifyfs_mount() failed (rc=%d)", rc); test_abort(cfg, rc); + return -1; } #endif test_barrier(cfg); - } else { - if (cfg->debug) { - test_pause(cfg, "Finished test initialization"); - } + } + + if (cfg->debug) { + test_pause(cfg, "Finished test initialization"); } return 0; @@ -1047,7 +1281,7 @@ void test_fini(test_cfg* cfg) test_close_file(cfg); - if (cfg->use_unifyfs) { + if (cfg->use_unifyfs && !cfg->enable_mpi_mount) { #ifndef 
DISABLE_UNIFYFS int rc = unifyfs_unmount(); if (rc) { @@ -1057,7 +1291,7 @@ void test_fini(test_cfg* cfg) } if (cfg->use_mpi) { - MPI_Finalize(); + MPI_CHECK(cfg, (MPI_Finalize())); } if (NULL != cfg->filename) { @@ -1068,6 +1302,14 @@ void test_fini(test_cfg* cfg) free(cfg->mountpt); } + if (NULL != cfg->output_file) { + free(cfg->output_file); + if (NULL != cfg->output_fp) { + fflush(cfg->output_fp); + fclose(cfg->output_fp); + } + } + memset(cfg, 0, sizeof(test_cfg)); } diff --git a/examples/src/testutil_rdwr.h b/examples/src/testutil_rdwr.h index 1ed21813f..2f5d5ec79 100644 --- a/examples/src/testutil_rdwr.h +++ b/examples/src/testutil_rdwr.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2019, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2019, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -15,6 +15,17 @@ #ifndef UNIFYFS_TESTUTIL_RDWR_H #define UNIFYFS_TESTUTIL_RDWR_H +#include "testutil.h" + + +static inline +void test_print_aiocb(test_cfg* cfg, struct aiocb* cbp) +{ + test_print(cfg, "aiocb(fd=%d, op=%d, count=%zu, offset=%zu, buf=%p)", + cbp->aio_fildes, cbp->aio_lio_opcode, cbp->aio_nbytes, + cbp->aio_offset, cbp->aio_buf); +} + /* -------- Write Helper Methods -------- */ static inline @@ -22,14 +33,16 @@ int issue_write_req(test_cfg* cfg, struct aiocb* req) { int rc, err; ssize_t ss; - size_t written, remaining; off_t off; void* src; assert(NULL != cfg); - errno = 0; + size_t written = 0; + size_t remaining = req->aio_nbytes; + if (cfg->use_aio) { // aio_write(2) + errno = 0; rc = aio_write(req); if (-1 == rc) { test_print(cfg, "aio_write() failed"); @@ -37,11 +50,17 @@ int issue_write_req(test_cfg* cfg, struct aiocb* req) return rc; } else if (cfg->use_mapio) { // mmap(2) return ENOTSUP; + } else if (cfg->use_mpiio) { // MPI-IO + MPI_Status mst; + MPI_Offset off = (MPI_Offset) req->aio_offset; + void* src_buf = (void*) req->aio_buf; + int count = (int) remaining; + MPI_CHECK(cfg, (MPI_File_write_at(cfg->mpifh, off, src_buf, + count, MPI_CHAR, &mst))); } else if (cfg->use_prdwr) { // pwrite(2) - written = 0; - remaining = req->aio_nbytes; do { src = (void*)((char*)req->aio_buf + written); + errno = 0; ss = pwrite(req->aio_fildes, src, remaining, (req->aio_offset + written)); if (-1 == ss) { @@ -60,8 +79,7 @@ int issue_write_req(test_cfg* cfg, struct aiocb* req) } else if (cfg->use_vecio) { // writev(2) return EINVAL; } else { // write(2) - written = 0; - remaining = req->aio_nbytes; + errno = 0; off = lseek(req->aio_fildes, req->aio_offset, SEEK_SET); if (-1 == off) { test_print(cfg, "lseek() failed"); @@ -69,6 +87,7 @@ int issue_write_req(test_cfg* cfg, struct aiocb* req) } do { src = (void*)((char*)req->aio_buf + written); + errno = 0; ss = write(req->aio_fildes, src, remaining); if (-1 == ss) { err = errno; @@ -97,6 +116,7 @@ int issue_write_req_batch(test_cfg* cfg, size_t n_reqs, struct aiocb* reqs) struct aiocb* lio_vec[n_reqs]; for (i = 0; i < n_reqs; i++) { lio_vec[i] = reqs + i; + //test_print_aiocb(cfg, lio_vec[i]); } lio_mode = LIO_WAIT; if (cfg->use_aio) { @@ -177,6 +197,29 @@ int wait_write_req_batch(test_cfg* cfg, size_t n_reqs, struct aiocb* reqs) return ret; } +static inline +int write_truncate(test_cfg* cfg) +{ + int rc = 0; + + if (cfg->use_mpiio) { + MPI_Offset mpi_off = (MPI_Offset) cfg->trunc_size; + MPI_CHECK(cfg, (MPI_File_set_size(cfg->mpifh, mpi_off))); + } else { + if 
(cfg->rank == 0 || cfg->io_pattern == IO_PATTERN_NN) { + if (-1 != cfg->fd) { // ftruncate(2) + rc = ftruncate(cfg->fd, cfg->trunc_size); + if (-1 == rc) { + test_print(cfg, "ftruncate() failed"); + return -1; + } + } + } + } + + return rc; +} + static inline int write_sync(test_cfg* cfg) { @@ -202,10 +245,58 @@ int write_sync(test_cfg* cfg) test_print(cfg, "fsync() failed"); return -1; } + } else if (cfg->use_mpiio) { + MPI_CHECK(cfg, (MPI_File_sync(cfg->mpifh))); } return 0; } +static inline +int write_laminate(test_cfg* cfg, const char* filepath) +{ + /* need one process to laminate each file, + * we use the same process that created the file */ + int rc = 0; + if (cfg->rank == 0 || cfg->io_pattern == IO_PATTERN_NN) { + /* laminate by setting permissions to read-only */ + int chmod_rc = chmod(filepath, 0444); + if (-1 == chmod_rc) { + /* lamination failed */ + test_print(cfg, "chmod() during lamination failed"); + rc = -1; + } + } + if (cfg->io_pattern == IO_PATTERN_N1) { + test_barrier(cfg); + if (cfg->rank != 0) { + /* call stat() to update global metadata */ + struct stat st; + int stat_rc = stat(filepath, &st); + if (-1 == stat_rc) { + /* lamination failed */ + test_print(cfg, "stat() update during lamination failed"); + rc = -1; + } + } + } + return rc; +} + +static inline +int stat_file(test_cfg* cfg, const char* filepath) +{ + int rc = 0; + if (cfg->rank == 0 || cfg->io_pattern == IO_PATTERN_NN) { + struct stat s; + int stat_rc = stat(filepath, &s); + if (-1 == stat_rc) { + test_print(cfg, "ERROR - stat(%s) failed", filepath); + rc = -1; + } + } + return rc; +} + /* -------- Read Helper Methods -------- */ static inline @@ -219,8 +310,8 @@ int issue_read_req(test_cfg* cfg, struct aiocb* req) assert(NULL != cfg); - errno = 0; if (cfg->use_aio) { // aio_read(2) + errno = 0; rc = aio_read(req); if (-1 == rc) { test_print(cfg, "aio_read() failed"); @@ -228,11 +319,19 @@ int issue_read_req(test_cfg* cfg, struct aiocb* req) return rc; } else if (cfg->use_mapio) { // mmap(2) return ENOTSUP; + } else if (cfg->use_mpiio) { // MPI-IO + MPI_Status mst; + MPI_Offset off = (MPI_Offset) req->aio_offset; + void* dst_buf = (void*) req->aio_buf; + int count = (int) req->aio_nbytes; + MPI_CHECK(cfg, (MPI_File_read_at(cfg->mpifh, off, dst_buf, + count, MPI_CHAR, &mst))); } else if (cfg->use_prdwr) { // pread(2) nread = 0; remaining = req->aio_nbytes; do { dst = (void*)((char*)req->aio_buf + nread); + errno = 0; ss = pread(req->aio_fildes, dst, remaining, (req->aio_offset + nread)); if (-1 == ss) { @@ -256,6 +355,7 @@ int issue_read_req(test_cfg* cfg, struct aiocb* req) } else { // read(2) nread = 0; remaining = req->aio_nbytes; + errno = 0; off = lseek(req->aio_fildes, req->aio_offset, SEEK_SET); if (-1 == off) { test_print(cfg, "lseek() failed"); @@ -263,6 +363,7 @@ int issue_read_req(test_cfg* cfg, struct aiocb* req) } do { dst = (void*)((char*)req->aio_buf + nread); + errno = 0; ss = read(req->aio_fildes, dst, remaining); if (-1 == ss) { err = errno; diff --git a/examples/src/transfer.c b/examples/src/transfer.c index a01e36702..e31c297f0 100644 --- a/examples/src/transfer.c +++ b/examples/src/transfer.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. 
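The MPI-IO branches above pick the communicator from the I/O pattern: an N-to-1 shared file is opened collectively on MPI_COMM_WORLD, while N-to-N (file-per-process) runs open on MPI_COMM_SELF, with POSIX access flags mapped to MPI_MODE_* bits and failures reported through MPI_Error_string(). The self-contained sketch below shows the same decision using only standard MPI calls; posix_to_mpi_amode(), open_mpiio(), and the shared_file flag are simplified stand-ins for the testutil helpers (it always adds MPI_MODE_CREATE and omits the MPI_MODE_EXCL bit used for per-process creates).

#include <fcntl.h>
#include <stdio.h>
#include <mpi.h>

/* map POSIX access flags to MPI-IO amode bits (simplified) */
static int posix_to_mpi_amode(int access)
{
    switch (access) {
    case O_RDONLY: return MPI_MODE_RDONLY;
    case O_WRONLY: return MPI_MODE_WRONLY;
    case O_RDWR:   return MPI_MODE_RDWR;
    default:       return 0;
    }
}

/* open 'path' collectively for a shared file, per-process otherwise */
static int open_mpiio(const char* path, int access, int shared_file,
                      MPI_File* fh)
{
    int amode = posix_to_mpi_amode(access) | MPI_MODE_CREATE;
    MPI_Comm comm = shared_file ? MPI_COMM_WORLD : MPI_COMM_SELF;

    int rc = MPI_File_open(comm, path, amode, MPI_INFO_NULL, fh);
    if (rc != MPI_SUCCESS) {
        char errstr[MPI_MAX_ERROR_STRING];
        int len = 0;
        MPI_Error_string(rc, errstr, &len);
        fprintf(stderr, "MPI_File_open(%s) failed: %s\n", path, errstr);
        return -1;
    }
    return 0;
}

Because MPI_File_open() is collective over the chosen communicator, the shared-file case cannot be restricted to rank 0 the way the POSIX create path above is.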
@@ -11,6 +11,7 @@ * For details, see https://github.com/LLNL/UnifyFS. * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. */ + #include #include diff --git a/examples/src/write.c b/examples/src/write.c index 5e8e794f2..e1f9743ae 100644 --- a/examples/src/write.c +++ b/examples/src/write.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2019, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2019, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -102,6 +102,8 @@ size_t generate_write_reqs(test_cfg* cfg, char* srcbuf, * cfg.use_mapio - support is not yet implemented. When enabled, * direct memory loads and stores will be used for writes. * + * cfg.use_mpiio - when enabled, MPI-IO will be used. + * * cfg.use_prdwr - when enabled, pwrite(2) will be used. * * cfg.use_stdio - when enabled, fwrite(2) will be used. @@ -125,11 +127,21 @@ int main(int argc, char* argv[]) test_cfg test_config; test_cfg* cfg = &test_config; + test_timer time_create2laminate; + test_timer time_create; test_timer time_wr; test_timer time_sync; + test_timer time_close; + test_timer time_laminate; + test_timer time_stat; + timer_init(&time_create2laminate, "create2laminate"); + timer_init(&time_create, "create"); timer_init(&time_wr, "write"); timer_init(&time_sync, "sync"); + timer_init(&time_close, "close"); + timer_init(&time_laminate, "laminate"); + timer_init(&time_stat, "stat"); rc = test_init(argc, argv, cfg); if (rc) { @@ -146,13 +158,20 @@ int main(int argc, char* argv[]) return -1; } + // timer to wrap all parts of write operation + timer_start_barrier(cfg, &time_create2laminate); + + // create file target_file = test_target_filename(cfg); test_print_verbose_once(cfg, "DEBUG: creating target file %s", target_file); + timer_start_barrier(cfg, &time_create); rc = test_create_file(cfg, target_file, O_RDWR); if (rc) { test_abort(cfg, rc); } + timer_stop_barrier(cfg, &time_create); + test_print_verbose_once(cfg, "DEBUG: finished create"); // generate write requests test_print_verbose_once(cfg, "DEBUG: generating write requests"); @@ -167,8 +186,7 @@ int main(int argc, char* argv[]) // do writes test_print_verbose_once(cfg, "DEBUG: starting write requests"); - test_barrier(cfg); - timer_start(&time_wr); + timer_start_barrier(cfg, &time_wr); rc = issue_write_req_batch(cfg, num_reqs, reqs); if (rc) { test_abort(cfg, rc); @@ -177,38 +195,46 @@ int main(int argc, char* argv[]) if (rc) { test_abort(cfg, rc); } - timer_stop(&time_wr); + timer_stop_barrier(cfg, &time_wr); test_print_verbose_once(cfg, "DEBUG: finished write requests"); - // sync/laminate - timer_start(&time_sync); + // sync + timer_start_barrier(cfg, &time_sync); rc = write_sync(cfg); if (rc) { test_abort(cfg, rc); } - timer_stop(&time_sync); - test_barrier(cfg); + timer_stop_barrier(cfg, &time_sync); test_print_verbose_once(cfg, "DEBUG: finished sync"); - if ((test_config.rank == 0) || - (IO_PATTERN_NN == test_config.io_pattern)) { - /* laminate by removing write bits */ - chmod(target_file, 0400); - } - test_print_verbose_once(cfg, "DEBUG: finished lamination"); - // post-write cleanup free(wr_buf); free(reqs); reqs = NULL; // close file + timer_start_barrier(cfg, &time_close); rc = test_close_file(cfg); if (rc) { test_abort(cfg, rc); } + timer_stop_barrier(cfg, &time_close); + test_print_verbose_once(cfg, "DEBUG: finished close"); + + // laminate + 
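Each phase of write.c is now bracketed by timer_start_barrier()/timer_stop_barrier(), so a timer carries two results: elapsed_sec, the rank's own time for the phase, and elapsed_sec_all, which includes the trailing barrier and therefore reflects the slowest rank. A condensed sketch of that bracketing, assuming MPI is in use (time_phase() and elapsed() are illustrative names, not testutil functions):

#include <sys/time.h>
#include <mpi.h>

static double elapsed(const struct timeval* a, const struct timeval* b)
{
    return (double)(b->tv_sec - a->tv_sec)
           + (double)(b->tv_usec - a->tv_usec) / 1000000.0;
}

/* time one phase: record both a local duration and a
 * barrier-synchronized duration that covers the slowest rank */
static void time_phase(void (*phase)(void), double* local_sec,
                       double* global_sec)
{
    struct timeval start, stop, stop_all;

    MPI_Barrier(MPI_COMM_WORLD);     /* all ranks start together */
    gettimeofday(&start, NULL);

    phase();                         /* the work being measured */

    gettimeofday(&stop, NULL);       /* local stop */
    MPI_Barrier(MPI_COMM_WORLD);     /* wait for the slowest rank */
    gettimeofday(&stop_all, NULL);   /* global stop */

    *local_sec = elapsed(&start, &stop);
    *global_sec = elapsed(&start, &stop_all);
}

The per-rank minimum/maximum figures reported later in this patch come from reducing the local values, while the "Global" times come from the barrier-synchronized ones.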
timer_start_barrier(cfg, &time_laminate); + rc = write_laminate(cfg, target_file); + if (rc) { + test_abort(cfg, rc); + } + timer_stop_barrier(cfg, &time_laminate); + test_print_verbose_once(cfg, "DEBUG: finished laminate"); + + // timer to wrap all parts of write operation + timer_stop_barrier(cfg, &time_create2laminate); // file size check + timer_start_barrier(cfg, &time_stat); size_t rank_bytes = test_config.n_blocks * test_config.block_sz; size_t total_bytes = rank_bytes * test_config.n_ranks; size_t expected = total_bytes; @@ -229,6 +255,8 @@ int main(int argc, char* argv[]) } } } + timer_stop_barrier(cfg, &time_stat); + test_print_verbose_once(cfg, "DEBUG: finished stat"); // calculate achieved bandwidth rates double max_write_time, max_sync_time; @@ -249,8 +277,15 @@ int main(int argc, char* argv[]) "Number of processes: %d\n" "Each process wrote: %.2lf MiB\n" "Total data written: %.2lf MiB\n" + "File create time: %.6lf sec\n" "Maximum write time: %.6lf sec\n" + "Global write time: %.6lf sec\n" "Maximum sync time: %.6lf sec\n" + "Global sync time: %.6lf sec\n" + "File close time: %.6lf sec\n" + "File laminate time: %.6lf sec\n" + "Full write time: %.6lf sec\n" + "File stat time: %.6lf sec\n" "Aggregate write bandwidth: %.3lf MiB/s\n" "Effective write bandwidth: %.3lf MiB/s\n", io_pattern_str(test_config.io_pattern), @@ -259,16 +294,28 @@ int main(int argc, char* argv[]) test_config.n_ranks, bytes_to_mib(rank_bytes), bytes_to_mib(total_bytes), + time_create.elapsed_sec_all, max_write_time, + time_wr.elapsed_sec_all, max_sync_time, + time_sync.elapsed_sec_all, + time_close.elapsed_sec_all, + time_laminate.elapsed_sec_all, + time_create2laminate.elapsed_sec_all, + time_stat.elapsed_sec_all, aggr_write_bw, eff_write_bw); // cleanup free(target_file); + timer_fini(&time_create2laminate); + timer_fini(&time_create); timer_fini(&time_wr); timer_fini(&time_sync); + timer_fini(&time_close); + timer_fini(&time_laminate); + timer_fini(&time_stat); test_fini(cfg); diff --git a/examples/src/writeread.c b/examples/src/writeread.c index c104a5ee9..0522435ca 100644 --- a/examples/src/writeread.c +++ b/examples/src/writeread.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2019, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2019, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -155,6 +155,8 @@ size_t generate_read_reqs(test_cfg* cfg, char* dstbuf, * cfg.use_mapio - support is not yet implemented. When enabled, * direct memory loads and stores will be used for reads and writes. * + * cfg.use_mpiio - when enabled, MPI-IO will be used. + * * cfg.use_prdwr - when enabled, pread(2) and pwrite(2) will be used. * * cfg.use_stdio - when enabled, fread(2) and fwrite(2) will be used. @@ -164,6 +166,12 @@ size_t generate_read_reqs(test_cfg* cfg, char* dstbuf, * * cfg.io_check - when enabled, lipsum data is used when writing * the file and verified when reading. + * + * cfg.pre_wr_truncate - when enabled, truncate the file to specified + * size before writing. + * + * cfg.post_wr_truncate - when enabled, truncate the file to specified + * size after writing. 
*/ int main(int argc, char* argv[]) @@ -171,19 +179,31 @@ int main(int argc, char* argv[]) char* wr_buf; char* rd_buf; char* target_file; - struct aiocb* reqs; + struct aiocb* reqs = NULL; size_t num_reqs = 0; int rc; test_cfg test_config; test_cfg* cfg = &test_config; + test_timer time_create2laminate; + test_timer time_create; test_timer time_wr; test_timer time_rd; test_timer time_sync; + test_timer time_stat_pre; + test_timer time_stat_pre2; + test_timer time_laminate; + test_timer time_stat_post; + timer_init(&time_create2laminate, "create2laminate"); + timer_init(&time_create, "create"); timer_init(&time_wr, "write"); timer_init(&time_rd, "read"); timer_init(&time_sync, "sync"); + timer_init(&time_stat_pre, "statpre"); + timer_init(&time_stat_pre2, "statpre2"); + timer_init(&time_laminate, "laminate"); + timer_init(&time_stat_post, "statpost"); rc = test_init(argc, argv, cfg); if (rc) { @@ -200,13 +220,24 @@ int main(int argc, char* argv[]) return -1; } + // timer to wrap all parts of write operation + timer_start_barrier(cfg, &time_create2laminate); + + // create file target_file = test_target_filename(cfg); test_print_verbose_once(cfg, "DEBUG: creating target file %s", target_file); + timer_start_barrier(cfg, &time_create); rc = test_create_file(cfg, target_file, O_RDWR); if (rc) { test_abort(cfg, rc); } + timer_stop_barrier(cfg, &time_create); + test_print_verbose_once(cfg, "DEBUG: finished create"); + + if (cfg->pre_wr_trunc) { + write_truncate(cfg); + } // generate write requests test_print_verbose_once(cfg, "DEBUG: generating write requests"); @@ -221,8 +252,7 @@ int main(int argc, char* argv[]) // do writes test_print_verbose_once(cfg, "DEBUG: starting write requests"); - test_barrier(cfg); - timer_start(&time_wr); + timer_start_barrier(cfg, &time_wr); rc = issue_write_req_batch(cfg, num_reqs, reqs); if (rc) { test_abort(cfg, rc); @@ -231,19 +261,49 @@ int main(int argc, char* argv[]) if (rc) { test_abort(cfg, rc); } - timer_stop(&time_wr); + timer_stop_barrier(cfg, &time_wr); test_print_verbose_once(cfg, "DEBUG: finished write requests"); - // sync/laminate - timer_start(&time_sync); + // sync + timer_start_barrier(cfg, &time_sync); rc = write_sync(cfg); if (rc) { test_abort(cfg, rc); } - timer_stop(&time_sync); - test_barrier(cfg); + timer_stop_barrier(cfg, &time_sync); test_print_verbose_once(cfg, "DEBUG: finished sync"); + // stat file pre-laminate + timer_start_barrier(cfg, &time_stat_pre); + stat_file(cfg, target_file); + timer_stop_barrier(cfg, &time_stat_pre); + test_print_verbose_once(cfg, "DEBUG: finished stat pre-laminate"); + + if (cfg->post_wr_trunc) { + write_truncate(cfg); + + // stat file post-truncate + timer_start_barrier(cfg, &time_stat_pre2); + stat_file(cfg, target_file); + timer_stop_barrier(cfg, &time_stat_pre2); + test_print_verbose_once(cfg, "DEBUG: finished stat pre2 (post trunc)"); + } + + // laminate + timer_start_barrier(cfg, &time_laminate); + rc = write_laminate(cfg, target_file); + if (rc) { + test_abort(cfg, rc); + } + timer_stop_barrier(cfg, &time_laminate); + test_print_verbose_once(cfg, "DEBUG: finished laminate"); + + // stat file post-laminate + timer_start_barrier(cfg, &time_stat_post); + stat_cmd(cfg, target_file); + timer_stop_barrier(cfg, &time_stat_post); + test_print_verbose_once(cfg, "DEBUG: finished stat post-laminate"); + // post-write cleanup free(wr_buf); free(reqs); @@ -262,8 +322,7 @@ int main(int argc, char* argv[]) // do reads test_print_verbose_once(cfg, "DEBUG: starting read requests"); - test_barrier(cfg); - 
timer_start(&time_rd); + timer_start_barrier(cfg, &time_rd); rc = issue_read_req_batch(cfg, num_reqs, reqs); if (rc) { test_abort(cfg, rc); @@ -272,8 +331,7 @@ int main(int argc, char* argv[]) if (rc) { test_abort(cfg, rc); } - timer_stop(&time_rd); - test_barrier(cfg); + timer_stop_barrier(cfg, &time_rd); test_print_verbose_once(cfg, "DEBUG: finished read requests"); if (test_config.io_check) { @@ -288,46 +346,96 @@ int main(int argc, char* argv[]) // calculate achieved bandwidth rates size_t rank_bytes = test_config.n_blocks * test_config.block_sz; - double write_bw, read_bw; - double aggr_write_bw, aggr_read_bw; - double max_write_time, max_read_time; - double min_sync_time, max_sync_time; - - write_bw = bandwidth_mib(rank_bytes, time_wr.elapsed_sec); - aggr_write_bw = test_reduce_double_sum(cfg, write_bw); - max_write_time = test_reduce_double_max(cfg, time_wr.elapsed_sec); - - min_sync_time = test_reduce_double_min(cfg, time_sync.elapsed_sec); - max_sync_time = test_reduce_double_max(cfg, time_sync.elapsed_sec); - - read_bw = bandwidth_mib(rank_bytes, time_rd.elapsed_sec); - aggr_read_bw = test_reduce_double_sum(cfg, read_bw); - max_read_time = test_reduce_double_max(cfg, time_rd.elapsed_sec); + size_t total_bytes = rank_bytes * test_config.n_ranks; + + double min_local_write_time = test_reduce_double_min(cfg, + time_wr.elapsed_sec); + double max_local_write_time = test_reduce_double_max(cfg, + time_wr.elapsed_sec); + double max_global_write_time = test_reduce_double_max(cfg, + time_wr.elapsed_sec_all); + + double local_write_bw = bandwidth_mib(rank_bytes, time_wr.elapsed_sec); + double min_local_write_bw = test_reduce_double_min(cfg, local_write_bw); + double max_local_write_bw = test_reduce_double_max(cfg, local_write_bw); + double aggr_local_write_bw = test_reduce_double_sum(cfg, local_write_bw); + + double global_write_bw = bandwidth_mib(total_bytes, max_global_write_time); + + double min_local_sync_time = test_reduce_double_min(cfg, + time_sync.elapsed_sec); + double max_local_sync_time = test_reduce_double_max(cfg, + time_sync.elapsed_sec); + double max_global_sync_time = test_reduce_double_max(cfg, + time_sync.elapsed_sec_all); + + double global_write_sync_bw = bandwidth_mib(total_bytes, + max_global_write_time + max_global_sync_time); + + double min_local_read_time = test_reduce_double_min(cfg, + time_rd.elapsed_sec); + double max_local_read_time = test_reduce_double_max(cfg, + time_rd.elapsed_sec); + double max_global_read_time = test_reduce_double_max(cfg, + time_rd.elapsed_sec_all); + + double local_read_bw = bandwidth_mib(rank_bytes, time_rd.elapsed_sec); + double min_local_read_bw = test_reduce_double_min(cfg, local_read_bw); + double max_local_read_bw = test_reduce_double_max(cfg, local_read_bw); + double aggr_local_read_bw = test_reduce_double_sum(cfg, local_read_bw); + + double global_read_bw = bandwidth_mib(total_bytes, max_global_read_time); if (test_config.rank == 0) { - size_t total_bytes = rank_bytes * test_config.n_ranks; - double eff_write_bw, eff_read_bw; - eff_write_bw = bandwidth_mib(total_bytes, max_write_time); - eff_read_bw = bandwidth_mib(total_bytes, max_read_time); - - printf("Aggregate Write BW is %.3lf MiB/s\n" - "Effective Write BW is %.3lf MiB/s\n\n", - aggr_write_bw, eff_write_bw); - printf("Minimum Sync Time is %.6lf s\n" - "Maximum Sync Time is %.6lf s\n\n", - min_sync_time, max_sync_time); - printf("Aggregate Read BW is %.3lf MiB/s\n" - "Effective Read BW is %.3lf MiB/s\n\n", - aggr_read_bw, eff_read_bw); - fflush(stdout); + errno = 0; /* just 
in case there was an earlier error */ + test_print_once(cfg, "File Create Time is %.6lf s", + time_create.elapsed_sec_all); + test_print_once(cfg, "Minimum Local Write BW is %.3lf MiB/s", + min_local_write_bw); + test_print_once(cfg, "Maximum Local Write BW is %.3lf MiB/s", + max_local_write_bw); + test_print_once(cfg, "Aggregate Local Write BW is %.3lf MiB/s", + aggr_local_write_bw); + test_print_once(cfg, "Global Write BW is %.3lf MiB/s", + global_write_bw); + test_print_once(cfg, "Minimum Local Sync Time is %.6lf s", + min_local_sync_time); + test_print_once(cfg, "Maximum Local Sync Time is %.6lf s", + max_local_sync_time); + test_print_once(cfg, "Global Sync Time is %.6lf s", + max_global_sync_time); + test_print_once(cfg, "Global Write+Sync BW is %.3lf MiB/s", + global_write_sync_bw); + test_print_once(cfg, "Stat Time Pre-Laminate is %.6lf s", + time_stat_pre.elapsed_sec_all); + test_print_once(cfg, "Stat Time Pre-Laminate2 is %.6lf s", + time_stat_pre.elapsed_sec_all); + test_print_once(cfg, "File Laminate Time is %.6lf s", + time_laminate.elapsed_sec_all); + test_print_once(cfg, "Stat Time Post-Laminate is %.6lf s", + time_stat_post.elapsed_sec_all); + test_print_once(cfg, "Minimum Local Read BW is %.3lf MiB/s", + min_local_read_bw); + test_print_once(cfg, "Maximum Local Read BW is %.3lf MiB/s", + max_local_read_bw); + test_print_once(cfg, "Aggregate Local Read BW is %.3lf MiB/s", + aggr_local_read_bw); + test_print_once(cfg, "Global Read BW is %.3lf MiB/s", + global_read_bw); } // cleanup free(target_file); + timer_fini(&time_create2laminate); + timer_fini(&time_create); timer_fini(&time_wr); timer_fini(&time_rd); timer_fini(&time_sync); + timer_fini(&time_stat_pre); + timer_fini(&time_stat_pre2); + timer_fini(&time_laminate); + timer_fini(&time_stat_post); test_fini(cfg); diff --git a/examples/src/writeread.f90 b/examples/src/writeread.f90 index 3c98fa6aa..7ecae3092 100644 --- a/examples/src/writeread.f90 +++ b/examples/src/writeread.f90 @@ -1,4 +1,16 @@ - program write_read_F +! Copyright (c) 2020, Lawrence Livermore National Security, LLC. +! Produced at the Lawrence Livermore National Laboratory. +! +! Copyright 2020, UT-Battelle, LLC. +! +! LLNL-CODE-741539 +! All rights reserved. +! +! This is the license for UnifyFS. +! For details, see https://github.com/LLNL/UnifyFS. +! Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + +program write_read_F implicit none @@ -43,7 +55,7 @@ program write_read_F writeunit = mynod open(unit=writeunit,file=fname,form='unformatted',action='write') - + write(writeunit,iostat=ios) W1 close(writeunit) diff --git a/extras/unifyfs.conf.in b/extras/unifyfs.conf.in index 365c7194e..20402e027 100644 --- a/extras/unifyfs.conf.in +++ b/extras/unifyfs.conf.in @@ -1,6 +1,9 @@ # unifyfs.conf -# NOTE: settings with default values are commented out +# NOTE: +# - settings with default values are commented out +# - string values should not be quoted, e.g., /var/tmp is correct but +# "/var/tmp" is not. 
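The writeread.c summary above distinguishes the aggregate of local bandwidths (each rank's bytes over its own write time, summed across ranks) from the global bandwidth (total bytes over the slowest rank's barrier-to-barrier time); the global figure is the more conservative of the two. A small sketch of that arithmetic, assuming every rank writes the same number of bytes (write_bw_summary() and BYTES_PER_MIB are illustrative, not the bandwidth_mib()/test_reduce_*() helpers):

#include <stddef.h>
#include <stdio.h>
#include <mpi.h>

#define BYTES_PER_MIB (1024.0 * 1024.0)

/* rank_bytes: bytes this rank wrote; local_sec: its own write time;
 * global_sec: its barrier-to-barrier write time */
static void write_bw_summary(size_t rank_bytes, double local_sec,
                             double global_sec, int n_ranks)
{
    double local_bw = ((double)rank_bytes / BYTES_PER_MIB) / local_sec;

    double aggr_bw = 0.0;    /* sum of per-rank local bandwidths */
    double max_global = 0.0; /* slowest rank's global write time */
    MPI_Reduce(&local_bw, &aggr_bw, 1, MPI_DOUBLE, MPI_SUM,
               0, MPI_COMM_WORLD);
    MPI_Reduce(&global_sec, &max_global, 1, MPI_DOUBLE, MPI_MAX,
               0, MPI_COMM_WORLD);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (rank == 0) {
        double total_mib = ((double)rank_bytes * n_ranks) / BYTES_PER_MIB;
        printf("Aggregate Local Write BW is %.3f MiB/s\n", aggr_bw);
        printf("Global Write BW is %.3f MiB/s\n", total_mib / max_global);
    }
}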
# # COMMENT STYLE: # '#' start of line comment character @@ -8,9 +11,9 @@ # SECTION: top-level configuration [unifyfs] -# consistency = "LAMINATED" ; NONE | LAMINATED | POSIX -# daemonize = on ; servers will become daemons -# mountpoint = "/unifyfs" ; mountpoint (i.e., prefix path) +# consistency = LAMINATED ; NONE | LAMINATED | POSIX +# daemonize = on ; servers will become daemons +# mountpoint = /unifyfs ; mountpoint (i.e., prefix path) # SECTION: client settings [client] @@ -24,8 +27,8 @@ verbosity = 5 ; logging verbosity level [0-5] (default: 0) # SECTION: metadata settings [meta] -# db_name = "unifyfs_metadb" ; metadata datbase name -db_path = "/var/tmp" ; metadata database directory path (default: /tmp) +# db_name = unifyfs_metadb ; metadata datbase name +db_path = /var/tmp ; metadata database directory path (default: /tmp) # SECTION: shared memory segment settings [shmem] @@ -34,7 +37,7 @@ chunk_mem = 67108864 ; segment size for data chunks (default: 256 MiB) # SECTION: spillover local to each node [spillover] -# enabled = on ; enable spillover to local storage -# data_dir = "/mnt/ssd" ; directory path for data spillover -# meta_dir = "/mnt/ssd" ; directory path for metadata spillover -size = 268435456 ; data spillover max size (default: 1 GiB) +# enabled = on ; enable spillover to local storage +# data_dir = /mnt/ssd ; directory path for data spillover +# meta_dir = /mnt/ssd ; directory path for metadata spillover +size = 268435456 ; data spillover max size (default: 1 GiB) diff --git a/m4/check_numa.m4 b/m4/check_numa.m4 deleted file mode 100644 index a7cb5dee1..000000000 --- a/m4/check_numa.m4 +++ /dev/null @@ -1,91 +0,0 @@ -dnl @synopsis CHECK_NUMA() -dnl -dnl This macro searches for an installed numa library. If nothing was -dnl specified when calling configure, it searches first in /usr/local -dnl and then in /usr. If the --with-numa=DIR is specified, it will try -dnl to find it in DIR/include/numa.h and DIR/lib/libnuma.a. If -dnl --without-numa is specified, the library is not searched at all. -dnl -dnl If either the header file (numa.h) or the library (libnuma) is not -dnl found, the configuration exits on error, asking for a valid numa -dnl installation directory or --without-numa. -dnl -dnl The macro defines the symbol HAVE_LIBNUMA if the library is found. You -dnl should use autoheader to include a definition for this symbol in a -dnl config.h file. 
Sample usage in a C/C++ source is as follows: -dnl -dnl #ifdef HAVE_LIBNUMA -dnl #include -dnl #endif /* HAVE_LIBNUMA */ -dnl -dnl @category InstalledPackages -dnl @author Loic Dachary -dnl @version 2004-09-20 -dnl @license GPLWithACException - -AC_DEFUN([CHECK_NUMA], [ - -# -# Handle user hints -# -LOOK_FOR_NUMA="no" -AC_MSG_CHECKING(if numa is wanted ) -AC_ARG_WITH([numa], - [AS_HELP_STRING([--with-numa=DIR],[root directory path of libnuma installation (defaults to /usr/local or /usr if not found in /usr/local)])], - [if test "$withval" != "no" ; then - AC_MSG_RESULT(yes) - LOOK_FOR_NUMA="yes" - if test "$withval" != "yes" ; then - # - # given a path to look in - # - NUMA_HOME="$withval" - fi - else - AC_MSG_RESULT(no) - fi], - [AC_MSG_RESULT(no)] -) - -# -# Locate numa, if wanted -# -if test "$LOOK_FOR_NUMA" = "yes" ; then - # - # determine where to look for libnuma - # - if test -n "${NUMA_HOME}" - then - AC_MSG_NOTICE([include libnuma from ${NUMA_HOME}]) - - # - # Look for NUMA where user tells us it is first - # - CFLAGS="-I${NUMA_HOME}/include $CFLAGS" - LDFLAGS="-L${NUMA_HOME}/lib $LDFLAGS" - else - AC_MSG_NOTICE([checking for libnuma installation in default locations]) - fi - - # - # Locate numa - # - AC_LANG_SAVE - AC_LANG_C - AC_CHECK_HEADER(numa.h, [numa_cv_numa_h=yes], [numa_cv_numa_h=no]) - AC_CHECK_LIB(numa, numa_num_possible_nodes, [numa_cv_libnuma=yes], [numa_cv_libnuma=no]) - AC_LANG_RESTORE - - # - # Determine whether we found it - # - if test "$numa_cv_libnuma" != "yes" -o "$numa_cv_numa_h" != "yes" - then - # - # If either header or library was not found, bomb - # - AC_MSG_ERROR(either specify a valid numa installation with --with-numa=DIR or disable numa usage with --without-numa) - fi -fi - -]) diff --git a/m4/flatcc.m4 b/m4/flatcc.m4 deleted file mode 100644 index f4d7f6658..000000000 --- a/m4/flatcc.m4 +++ /dev/null @@ -1,27 +0,0 @@ -AC_DEFUN([UNIFYFS_AC_FLATCC], [ - # preserve state of flags - FLATCC_OLD_CFLAGS=$CFLAGS - FLATCC_OLD_LDFLAGS=$LDFLAGS - - AC_ARG_WITH([flatcc], [AC_HELP_STRING([--with-flatcc=PATH], - [path to installed libflatcc [default=/usr/local]])], [ - FLATCC_CFLAGS="-I${withval}/include" - FLATCC_LDFLAGS="-L${withval}/lib" - CFLAGS="$CFLAGS ${FLATCC_CFLAGS}" - LDFLAGS="$LDFLAGS ${FLATCC_LDFLAGS}" - ], []) - - AC_CHECK_LIB([flatcc], [flatcc_create_context], - [FLATCC_LIBS="-lflatcc" - AC_SUBST(FLATCC_CFLAGS) - AC_SUBST(FLATCC_LDFLAGS) - AC_SUBST(FLATCC_LIBS) - ], - [AC_MSG_ERROR([couldn't find a suitable libflatcc, use --with-flatcc=PATH])], - [] - ) - - # restore flags - CFLAGS=$FLATCC_OLD_CFLAGS - LDFLAGS=$FLATCC_OLD_LDFLAGS -]) diff --git a/m4/gotcha.m4 b/m4/gotcha.m4 index 6d32ac5f3..54e09a5db 100644 --- a/m4/gotcha.m4 +++ b/m4/gotcha.m4 @@ -7,17 +7,21 @@ AC_DEFUN([UNIFYFS_AC_GOTCHA], [ AC_ARG_WITH([gotcha], [AC_HELP_STRING([--with-gotcha=PATH], [path to installed libgotcha [default=/usr/local]])], [ GOTCHA_CFLAGS="-I${withval}/include" - GOTCHA_LDFLAGS="-L${withval}/lib64" + GOTCHA_LDFLAGS="-L${withval}/lib64 -L${withval}/lib" CFLAGS="$CFLAGS ${GOTCHA_CFLAGS}" CXXFLAGS="$CXXFLAGS ${GOTCHA_CFLAGS}" LDFLAGS="$LDFLAGS ${GOTCHA_LDFLAGS}" ], []) AC_CHECK_LIB([gotcha], [gotcha_wrap], - [AC_SUBST(GOTCHA_CFLAGS) - AC_SUBST(GOTCHA_LDFLAGS) + [ + AC_SUBST(GOTCHA_CFLAGS) + AC_SUBST(GOTCHA_LDFLAGS) + AM_CONDITIONAL([HAVE_GOTCHA], [true]) + ],[ + AC_MSG_WARN([couldn't find a suitable libgotcha, use --with-gotcha=PATH]) + AM_CONDITIONAL([HAVE_GOTCHA], [false]) ], - [AC_MSG_ERROR([couldn't find a suitable libgotcha, use --with-gotcha=PATH])], [] ) diff 
--git a/m4/leveldb.m4 b/m4/leveldb.m4 index 2eed55189..b4f02daa5 100644 --- a/m4/leveldb.m4 +++ b/m4/leveldb.m4 @@ -3,13 +3,15 @@ AC_DEFUN([UNIFYFS_AC_LEVELDB], [ LEVELDB_OLD_CFLAGS=$CFLAGS LEVELDB_OLD_LDFLAGS=$LDFLAGS - AC_ARG_WITH([leveldb], [AC_HELP_STRING([--with-leveldb=PATH], - [path to installed libleveldb [default=/usr/local]])], [ - LEVELDB_CFLAGS="-I${withval}/include" - LEVELDB_LDFLAGS="-L${withval}/lib -L${withval}/lib64" + AC_ARG_VAR([LEVELDB_ROOT], [Set the path to the LevelDB installation Directory]) + + AS_IF([test -n "$LEVELDB_ROOT"],[ + AC_MSG_NOTICE([LEVELDB_ROOT is set, checking for LevelDB in $LEVELDB_ROOT]) + LEVELDB_CFLAGS="-I${LEVELDB_ROOT}/include" + LEVELDB_LDFLAGS="-L${LEVELDB_ROOT}/lib -L${LEVELDB_ROOT}/lib64" CFLAGS="$CFLAGS ${LEVELDB_CFLAGS}" LDFLAGS="$LDFLAGS ${LEVELDB_LDFLAGS}" - ], []) + ],[]) AC_CHECK_LIB([leveldb], [leveldb_open], [LEVELDB_LIBS="-lleveldb" @@ -17,7 +19,7 @@ AC_DEFUN([UNIFYFS_AC_LEVELDB], [ AC_SUBST(LEVELDB_LDFLAGS) AC_SUBST(LEVELDB_LIBS) ], - [AC_MSG_ERROR([couldn't find a suitable libleveldb, use --with-leveldb=PATH])], + [AC_MSG_ERROR([couldn't find a suitable libleveldb, use LEVELDB_ROOT to set the path to the installation directory.])], [] ) diff --git a/m4/openssl.m4 b/m4/openssl.m4 new file mode 100644 index 000000000..c40ac5cd2 --- /dev/null +++ b/m4/openssl.m4 @@ -0,0 +1,20 @@ +AC_DEFUN([UNIFYFS_AC_OPENSSL], [ + # preserve state of flags + OPENSSL_OLD_CFLAGS=$CFLAGS + OPENSSL_OLD_CXXFLAGS=$CXXFLAGS + OPENSSL_OLD_LDFLAGS=$LDFLAGS + + PKG_CHECK_MODULES([OPENSSL],[openssl], + [ + AC_SUBST(OPENSSL_CFLAGS) + AC_SUBST(OPENSSL_LIBS) + ], + [AC_MSG_ERROR(m4_normalize([ + couldn't find a suitable openssl-devel + ]))]) + + # restore flags + CFLAGS=$OPENSSL_OLD_CFLAGS + CXXFLAGS=$OPENSSL_OLD_CXXFLAGS + LDFLAGS=$OPENSSL_OLD_LDFLAGS +]) diff --git a/m4/spath.m4 b/m4/spath.m4 new file mode 100644 index 000000000..183a4d54b --- /dev/null +++ b/m4/spath.m4 @@ -0,0 +1,11 @@ +AC_DEFUN([UNIFYFS_AC_SPATH], [ + AC_CHECK_LIB([spath], [spath_strdup_reduce_str], + [ + LIBS="$LIBS -lspath" + AC_DEFINE([HAVE_SPATH], [1], [Defined if you have spath]) + ],[ + AC_MSG_WARN([couldn't find a suitable libspath]) + ], + [] + ) +]) diff --git a/meta/src/Makefile.am b/meta/src/Makefile.am index 4f4539ff1..dd094febe 100644 --- a/meta/src/Makefile.am +++ b/meta/src/Makefile.am @@ -1,4 +1,6 @@ +if USE_MDHIM noinst_LIBRARIES = libmdhim.a +endif libmdhim_a_SOURCES = Mlog2/mlog2.c \ Mlog2/mlog2.h \ @@ -36,6 +38,6 @@ AM_CPPFLAGS = -I$(top_srcdir)/meta/src/Mlog2 \ -I$(top_srcdir)/server/src AM_CFLAGS = -DLEVELDB_SUPPORT $(LEVELDB_CFLAGS) $(MPI_CFLAGS) $(MARGO_CFLAGS) -AM_CFLAGS += -Wall +AM_CFLAGS += -Wall -Werror CLEANFILES = diff --git a/meta/src/Mlog2/mlog2.c b/meta/src/Mlog2/mlog2.c index 10ea11692..cd56aa9f4 100644 --- a/meta/src/Mlog2/mlog2.c +++ b/meta/src/Mlog2/mlog2.c @@ -1174,6 +1174,8 @@ void mlog_setmasks(char *mstr, int mlen0) /* process priority */ if (prilen > 5) { /* we know it can't be longer than this */ prino = -1; + } else if (prilen < 0) { /* This if() block gets rid of a */ + prino = -1; /* compiler warning. 
*/ } else { memset(pbuf, 0, sizeof(pbuf)); strncpy(pbuf, pri, prilen); diff --git a/meta/src/ds_leveldb.c b/meta/src/ds_leveldb.c index b28453dc0..602b38ca3 100644 --- a/meta/src/ds_leveldb.c +++ b/meta/src/ds_leveldb.c @@ -397,6 +397,7 @@ int mdhim_leveldb_put(void *dbh, void *key, int key_len, void *data, int32_t dat gettimeofday(&start, NULL); options = mdhimdb->write_options; leveldb_put(mdhimdb->db, options, key, key_len, data, data_len, &err); + gettimeofday(&end, NULL); /* * temporarily mute the error message until the file metadata * operation is fully defined and implemented */ diff --git a/meta/src/ds_leveldb.h b/meta/src/ds_leveldb.h index 124aeaca1..befef0c23 100644 --- a/meta/src/ds_leveldb.h +++ b/meta/src/ds_leveldb.h @@ -47,7 +47,7 @@ #include "partitioner.h" #include "data_store.h" -#include "unifyfs_metadata.h" +#include "unifyfs_metadata_mdhim.h" /* Function pointer for comparator in C */ typedef int (*mdhim_store_cmp_fn_t)(void* arg, const char* a, size_t alen, diff --git a/meta/src/indexes.c b/meta/src/indexes.c index 359179d9a..0721a3cf8 100644 --- a/meta/src/indexes.c +++ b/meta/src/indexes.c @@ -287,8 +287,8 @@ int update_stat(struct mdhim_t *md, struct index_t *index, void *key, uint32_t k *(unsigned long *)val1 = get_byte_num(key, key_len); *(unsigned long *)val2 = *(unsigned long *)val1; } else if (index->key_type == MDHIM_UNIFYFS_KEY) { - val1 = get_meta_pair(key, key_len); - val2 = get_meta_pair(key, key_len); + val1 = copy_unifyfs_key(key, key_len); + val2 = copy_unifyfs_key(key, key_len); } gettimeofday(&metaend, NULL); metatime+=1000000*(metaend.tv_sec-metastart.tv_sec)+metaend.tv_usec-metastart.tv_usec; diff --git a/meta/src/mdhim.c b/meta/src/mdhim.c index d89cd9544..a48e0ee07 100644 --- a/meta/src/mdhim.c +++ b/meta/src/mdhim.c @@ -108,8 +108,8 @@ struct mdhim_t *mdhimInit(void *appComm, struct mdhim_options_t *opts) { } //Quit if MPI didn't initialize with multiple threads if (provided != MPI_THREAD_MULTIPLE) { - mlog(MDHIM_CLIENT_CRIT, "MDHIM - Error while initializing MPI with threads"); - exit(1); + mlog(MDHIM_CLIENT_WARN, "MDHIM - Error while initializing MPI with threads"); + //exit(1); } } diff --git a/meta/src/mdhim_private.c b/meta/src/mdhim_private.c index dc499c5ef..399ab0e0f 100644 --- a/meta/src/mdhim_private.c +++ b/meta/src/mdhim_private.c @@ -357,7 +357,6 @@ struct mdhim_bgetrm_t *_bget_records(struct mdhim_t *md, struct index_t *index, if ((op == MDHIM_GET_EQ || op == MDHIM_GET_PRIMARY_EQ || op == MDHIM_RANGE_BGET) && index->type != LOCAL_INDEX && (rl = get_range_servers(md, index, keys[i], key_lens[i])) == NULL) { - printf("here\n"); fflush(stdout); mlog(MDHIM_CLIENT_CRIT, "MDHIM Rank: %d - " "Error while determining range server in mdhimBget", md->mdhim_rank); diff --git a/meta/src/partitioner.c b/meta/src/partitioner.c index 37c171bcb..6f6013b80 100644 --- a/meta/src/partitioner.c +++ b/meta/src/partitioner.c @@ -34,12 +34,15 @@ * */ +#include #include #include #include #include #include "partitioner.h" +#include "unifyfs_metadata_mdhim.h" + struct timeval calslicestart, calsliceend; double calslicetime = 0; struct timeval rangehashstart, rangehashend; @@ -85,13 +88,12 @@ long double get_str_num(void *key, uint32_t key_len) { return str_num; } -unsigned long * get_meta_pair(void *key, uint32_t key_len) { - ulong *meta_pair = (ulong *)malloc (2*sizeof(ulong)); - memset(meta_pair, 0, 2*sizeof(unsigned long)); - meta_pair[0] = *((unsigned long *)(((char *)key))); - meta_pair[1] = *((unsigned long *)(((char *)key)+sizeof(unsigned long))); 
- // printf("meta_pair[1] is %ld\n", meta_pair[1]); - return meta_pair; +/* Allocate a copy of a key and return it. The returned key must be freed. */ +void* copy_unifyfs_key(void* key, uint32_t key_len) +{ + void* key_copy = malloc((size_t)key_len); + memcpy(key_copy, key, (size_t)key_len); + return key_copy; } uint64_t get_byte_num(void *key, uint32_t key_len) { @@ -386,26 +388,19 @@ int get_slice_num(struct mdhim_t *md, struct index_t *index, void *key, int key_ key_num = floorl(map_num * total_keys); break; + case MDHIM_UNIFYFS_KEY: + /* Use only the gfid portion of the key, which ensures all extents + * for the same file hash to the same server */ + key_num = (uint64_t) UNIFYFS_KEY_FID(key); + break; default: return 0; - break; } /* Convert the key to a slice number */ slice_num = key_num/index->mdhim_max_recs_per_slice; - if (key_type == MDHIM_UNIFYFS_KEY) { - unsigned long *meta_pair = get_meta_pair(key, key_len); - unsigned long surplus = meta_pair[1]; - unsigned long highval = (meta_pair[0] << 1); - unsigned long multiply = (unsigned long)1 << (sizeof(unsigned long)*8 - 1); -/* slice_num = (surplus + highval * multiply%index->mdhim_max_recs_per_slice) % \ - index->mdhim_max_recs_per_slice; -*/ - slice_num = highval * (multiply/index->mdhim_max_recs_per_slice) + surplus/index->mdhim_max_recs_per_slice; - free(meta_pair); - } //Return the slice number return slice_num; } diff --git a/meta/src/partitioner.h b/meta/src/partitioner.h index 9707af159..330d65528 100644 --- a/meta/src/partitioner.h +++ b/meta/src/partitioner.h @@ -102,12 +102,14 @@ long double get_str_num(void *key, uint32_t key_len); uint64_t get_byte_num(void *key, uint32_t key_len); int get_slice_num(struct mdhim_t *md, struct index_t *index, void *key, int key_len); int is_float_key(int type); -unsigned long * get_meta_pair(void *key, uint32_t key_len); + rangesrv_list *get_range_servers_from_stats(struct mdhim_t *md, struct index_t *index, void *key, int key_len, int op); rangesrv_list *get_range_servers_from_range(struct mdhim_t *md, struct index_t *index, void *start_key, void *end_key, int key_len); +void* copy_unifyfs_key(void* key, uint32_t key_len); + #ifdef __cplusplus } #endif diff --git a/meta/src/range_server.c b/meta/src/range_server.c index 75739dd09..460f1f3c3 100644 --- a/meta/src/range_server.c +++ b/meta/src/range_server.c @@ -50,7 +50,7 @@ #include "mdhim_options.h" #include "partitioner.h" #include "range_server.h" -#include "unifyfs_metadata.h" +#include "unifyfs_metadata_mdhim.h" #include "uthash.h" int recv_counter = 0; diff --git a/scripts/linux_kernel_checkpatch/checkpatch.pl b/scripts/linux_kernel_checkpatch/checkpatch.pl index 49af5338b..2db3b969b 100755 --- a/scripts/linux_kernel_checkpatch/checkpatch.pl +++ b/scripts/linux_kernel_checkpatch/checkpatch.pl @@ -3514,7 +3514,7 @@ sub process { ($s !~ /^\s*(?:\}|\{|else\b)/)) || ($sindent > $indent + 8))) { WARN("SUSPECT_CODE_INDENT", - "suspect code indent for conditional statements ($indent, $sindent)\n" . $herecurr . "$stat_real\n"); + "suspect (wrong # of spaces?) code indent for cond. stmts ($indent, $sindent)\n" . $herecurr . 
"$stat_real\n"); } } diff --git a/server/src/Makefile.am b/server/src/Makefile.am index 89338dbf6..47f5116f3 100644 --- a/server/src/Makefile.am +++ b/server/src/Makefile.am @@ -1,53 +1,91 @@ +bin_PROGRAMS = unifyfsd + noinst_LIBRARIES = libunifyfsd.a +unifyfsd_SOURCES = unifyfs_server.c + libunifyfsd_a_SOURCES = \ - arraylist.c \ - arraylist.h \ + extent_tree.c \ + extent_tree.h \ margo_server.c \ margo_server.h \ unifyfs_cmd_handler.c \ + unifyfs_fops.h \ unifyfs_global.h \ - unifyfs_metadata.c \ - unifyfs_metadata.h \ + unifyfs_group_rpc.h \ + unifyfs_group_rpc.c \ + unifyfs_inode.h \ + unifyfs_inode.c \ + unifyfs_inode_tree.h \ + unifyfs_inode_tree.c \ + unifyfs_metadata_mdhim.h \ + unifyfs_p2p_rpc.h \ + unifyfs_p2p_rpc.c \ unifyfs_request_manager.c \ unifyfs_request_manager.h \ unifyfs_service_manager.c \ unifyfs_service_manager.h \ - unifyfs_sock.c \ - unifyfs_sock.h + unifyfs_server_pid.c \ + unifyfs_tree.c \ + unifyfs_tree.h -bin_PROGRAMS = unifyfsd +OPT_CPP_FLAGS = +OPT_C_FLAGS = +OPT_LD_FLAGS = +OPT_LIBS = + +if USE_MDHIM + + libunifyfsd_a_SOURCES += \ + unifyfs_metadata_mdhim.c \ + unifyfs_fops_mdhim.c + + OPT_CPP_FLAGS += \ + -DUSE_MDHIM \ + -I$(top_srcdir)/meta/src \ + -I$(top_srcdir)/meta/src/uthash \ + -I$(top_srcdir)/meta/src/Mlog2 + + OPT_C_FLAGS += \ + $(LEVELDB_CFLAGS) \ + $(MPI_CFLAGS) + + OPT_LD_FLAGS += \ + $(LEVELDB_LDFLAGS) \ + $(MPI_CLDFLAGS) + + OPT_LIBS += \ + $(top_builddir)/meta/src/libmdhim.a \ + $(LEVELDB_LIBS) + +else # ! USE_MDHIM + + libunifyfsd_a_SOURCES += \ + unifyfs_fops_rpc.c + +endif # USE_MDHIM -unifyfsd_SOURCES = unifyfs_init.c unifyfsd_LDFLAGS = -static \ - $(LEVELDB_LDFLAGS) \ - $(MARGO_LDFLAGS) \ - $(FLATCC_LDFLAGS) + $(OPT_LD_FLAGS) \ + $(MARGO_LDFLAGS) unifyfsd_LDADD = \ libunifyfsd.a \ $(top_builddir)/common/src/libunifyfs_common.la \ - $(top_builddir)/meta/src/libmdhim.a \ - $(LEVELDB_LIBS) \ - $(MPI_CLDFLAGS) \ + $(OPT_LIBS) \ $(MARGO_LIBS) \ - $(FLATCC_LIBS) \ -lpthread -lm -lstdc++ -lrt AM_CPPFLAGS = \ + $(OPT_CPP_FLAGS) \ -I$(top_srcdir)/common/src \ - -I$(top_srcdir)/client/src \ - -I$(top_srcdir)/meta/src \ - -I$(top_srcdir)/meta/src/uthash \ - -I$(top_srcdir)/meta/src/Mlog2 - -AM_CFLAGS = -Wall \ - $(LEVELDB_CFLAGS) \ - $(MPI_CFLAGS) \ - $(MERCURY_CFLAGS) \ - $(ARGOBOTS_CFLAGS) \ + -I$(top_srcdir)/client/src + +AM_CFLAGS = -Wall -Werror \ + $(OPT_C_FLAGS) \ $(MARGO_CFLAGS) \ - $(FLATCC_CFLAGS) + $(MERCURY_CFLAGS) \ + $(ARGOBOTS_CFLAGS) CLEANFILES = $(bin_PROGRAMS) diff --git a/server/src/extent_tree.c b/server/src/extent_tree.c new file mode 100644 index 000000000..c6e05d8bf --- /dev/null +++ b/server/src/extent_tree.c @@ -0,0 +1,705 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +/* + * This file is a simple, thread-safe, segment tree implementation. The + * segments in the tree are non-overlapping. Added segments overwrite the old + * segments in the tree. This is used to coalesce writes before an fsync. + */ + +#include +#include +#include +#include +#include + +#include "extent_tree.h" +#include "tree.h" +#include "unifyfs_metadata_mdhim.h" + +#undef MIN +#define MIN(a, b) (a < b ? a : b) +#undef MAX +#define MAX(a, b) (a > b ? 
a : b) + +int compare_func( + struct extent_tree_node* node1, + struct extent_tree_node* node2) +{ + if (node1->start > node2->end) { + return 1; + } else if (node1->end < node2->start) { + return -1; + } else { + return 0; + } +} + +RB_PROTOTYPE(inttree, extent_tree_node, entry, compare_func) +RB_GENERATE(inttree, extent_tree_node, entry, compare_func) + +/* Returns 0 on success, positive non-zero error code otherwise */ +int extent_tree_init(struct extent_tree* extent_tree) +{ + memset(extent_tree, 0, sizeof(*extent_tree)); + pthread_rwlock_init(&extent_tree->rwlock, NULL); + RB_INIT(&extent_tree->head); + return 0; +} + +/* + * Remove and free all nodes in the extent_tree. + */ +void extent_tree_destroy(struct extent_tree* extent_tree) +{ + extent_tree_clear(extent_tree); + pthread_rwlock_destroy(&extent_tree->rwlock); +} + +/* Allocate a node for the range tree. Free node with free() when finished */ +static struct extent_tree_node* extent_tree_node_alloc( + unsigned long start, /* logical starting offset of extent */ + unsigned long end, /* logical ending offset of extent */ + int svr_rank, /* rank of server hosting data */ + int app_id, /* application id (namespace) on server rank */ + int cli_id, /* client rank on server rank */ + unsigned long pos) /* physical offset of data in log */ +{ + /* allocate a new node structure */ + struct extent_tree_node* node = calloc(1, sizeof(*node)); + if (!node) { + return NULL; + } + + /* record logical range and physical offset */ + node->start = start; + node->end = end; + node->svr_rank = svr_rank; + node->app_id = app_id; + node->cli_id = cli_id; + node->pos = pos; + + return node; +} + +/* + * Given two start/end ranges, return a new range from start1/end1 that + * does not overlap start2/end2. The non-overlapping range is stored + * in new_start/new_end. If there are no non-overlapping ranges, + * return 1 from this function, else return 0. If there are two + * non-overlapping ranges, return the first one in new_start/new_end. + */ +static int get_non_overlapping_range( + unsigned long start1, unsigned long end1, + long start2, long end2, + long* new_start, long* new_end) +{ + /* this function is only called when we know that segment 1 + * and segment 2 overlap with each other, find first portion + * of segment 1 that does not overlap with segment 2, if any */ + if (start1 < start2) { + /* Segment 1 inlcudes a portion before segment 2 starts + * return start/end of that leading portion of segment 1 + * + * s1-------e1 + * s2--------e2 + * ---- non-overlap + */ + *new_start = start1; + *new_end = start2 - 1; + return 0; + } else if (end1 > end2) { + /* Segment 1 does not start before segment 2, + * but segment 1 extends past end of segment 2 + * return start/end of trailing portion of segment 1 + * + * s1-----e1 + * s2-------e2 + * --- non-overlap + */ + *new_start = end2 + 1; + *new_end = end1; + return 0; + } + + /* Segment 2 completely envelops segment 1 + * nothing left of segment 1 to return + * so return 1 to indicate this case + * + * s1-------e1 + * s2-------------e2 + */ + return 1; +} + +/* + * Add an entry to the range tree. Returns 0 on success, nonzero otherwise. 
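 *
 * A minimal usage sketch (hypothetical offsets and IDs, shown only for
 * illustration and assuming the tree was set up with extent_tree_init()):
 *
 *    struct extent_tree tree;
 *    extent_tree_init(&tree);
 *    extent_tree_add(&tree, 0, 99, svr_rank, app_id, cli_id, 0);
 *    extent_tree_add(&tree, 50, 149, svr_rank, app_id, cli_id, 100);
 *
 * After the second call the tree holds [0-49] and [50-149]; wherever the
 * two extents overlap, the newer write wins.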
+ */ +int extent_tree_add( + struct extent_tree* extent_tree, /* tree to add new extent item */ + unsigned long start, /* logical starting offset of extent */ + unsigned long end, /* logical ending offset of extent */ + int svr_rank, /* rank of server hosting data */ + int app_id, /* application id (namespace) on server rank */ + int cli_id, /* client rank on server rank */ + unsigned long pos) /* physical offset of data in log */ +{ + /* assume we'll succeed */ + int rc = 0; + + /* Create node to define our new range */ + struct extent_tree_node* node = extent_tree_node_alloc( + start, end, svr_rank, app_id, cli_id, pos); + if (!node) { + return ENOMEM; + } + + /* lock the tree so we can modify it */ + extent_tree_wrlock(extent_tree); + + /* Try to insert our range into the RB tree. If it overlaps with any other + * range, then it is not inserted, and the overlapping range node is + * returned in 'overlap'. If 'overlap' is NULL, then there were no + * overlaps, and our range was successfully inserted. */ + struct extent_tree_node* overlap; + while ((overlap = RB_INSERT(inttree, &extent_tree->head, node))) { + /* Our range overlaps with another range (in 'overlap'). Is there any + * any part of 'overlap' that does not overlap our range? If so, + * delete the old 'overlap' and insert the smaller, non-overlapping + * range. */ + long new_start = 0; + long new_end = 0; + int ret = get_non_overlapping_range(overlap->start, overlap->end, + start, end, &new_start, &new_end); + if (ret) { + /* The new range we are adding completely covers the existing + * range in the tree defined in overlap. + * We can't find a non-overlapping range. + * Delete the existing range. */ + RB_REMOVE(inttree, &extent_tree->head, overlap); + free(overlap); + extent_tree->count--; + } else { + /* Part of the old range was non-overlapping. Split the old range + * into two ranges: one for the non-overlapping section, and one for + * the remaining section. The non-overlapping section gets + * inserted without issue. The remaining section will be processed + * on the next pass of this while() loop. */ + struct extent_tree_node* resized = extent_tree_node_alloc( + new_start, new_end, + overlap->svr_rank, overlap->app_id, overlap->cli_id, + overlap->pos + (new_start - overlap->start)); + if (!resized) { + /* failed to allocate memory for range node, + * bail out and release lock without further + * changing state of extent tree */ + free(node); + rc = ENOMEM; + goto release_add; + } + + /* if the non-overlapping part came from the front + * portion of the existing range, then there is a + * trailing portion of the existing range to add back + * to be considered again in the next loop iteration */ + struct extent_tree_node* remaining = NULL; + if (resized->end < overlap->end) { + /* There's still a remaining section after the non-overlapping + * part. Add it in. 
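(For illustration: adding [40-59] on top of an existing [0-99] reaches this
             * branch; the first pass keeps [0-39] as the resized piece and
             * re-inserts [40-99] here as the remaining piece, and the next
             * pass of the loop trims that down to [60-99], leaving [0-39],
             * [40-59], and [60-99] in the tree.)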
*/ + remaining = extent_tree_node_alloc( + resized->end + 1, overlap->end, + overlap->svr_rank, overlap->app_id, overlap->cli_id, + overlap->pos + (resized->end + 1 - overlap->start)); + if (!remaining) { + /* failed to allocate memory for range node, + * bail out and release lock without further + * changing state of extent tree */ + free(node); + free(resized); + rc = ENOMEM; + goto release_add; + } + } + + /* Remove our old range and release it */ + RB_REMOVE(inttree, &extent_tree->head, overlap); + free(overlap); + extent_tree->count--; + + /* Insert the non-overlapping part of the new range */ + RB_INSERT(inttree, &extent_tree->head, resized); + extent_tree->count++; + + /* if we have a trailing portion, insert range for that, + * and increase our extent count since we just turned one + * range entry into two */ + if (remaining != NULL) { + RB_INSERT(inttree, &extent_tree->head, remaining); + extent_tree->count++; + } + } + } + + /* increment segment count in the tree for the + * new range we just added */ + extent_tree->count++; + + /* update max ending offset if end of new range + * we just inserted is larger */ + extent_tree->max = MAX(extent_tree->max, end); + + /* get temporary pointer to the node we just added */ + struct extent_tree_node* target = node; + + /* check whether we can coalesce new extent with any preceding extent */ + struct extent_tree_node* prev = RB_PREV( + inttree, &extent_tree->head, target); + if (prev != NULL && prev->end + 1 == target->start) { + /* found a extent that ends just before the new extent starts, + * check whether they are also contiguous in the log */ + unsigned long pos_end = prev->pos + (prev->end - prev->start + 1); + if (prev->svr_rank == target->svr_rank && + prev->cli_id == target->cli_id && + prev->app_id == target->app_id && + pos_end == target->pos) { + /* the preceding extent describes a log position adjacent to + * the extent we just added, so we can merge them, + * append entry to previous by extending end of previous */ + prev->end = target->end; + + /* delete new extent from the tree and free it */ + RB_REMOVE(inttree, &extent_tree->head, target); + free(target); + extent_tree->count--; + + /* update target to point at previous extent since we just + * merged our new extent into it */ + target = prev; + } + } + + /* check whether we can coalesce new extent with any trailing extent */ + struct extent_tree_node* next = RB_NEXT( + inttree, &extent_tree->head, target); + if (next != NULL && target->end + 1 == next->start) { + /* found a extent that starts just after the new extent ends, + * check whether they are also contiguous in the log */ + unsigned long pos_end = target->pos + (target->end - target->start + 1); + if (target->svr_rank == next->svr_rank && + target->cli_id == next->cli_id && + target->app_id == next->app_id && + pos_end == next->pos) { + /* the target extent describes a log position adjacent to + * the next extent, so we can merge them, + * append entry to target by extending end of to cover next */ + target->end = next->end; + + /* delete next extent from the tree and free it */ + RB_REMOVE(inttree, &extent_tree->head, next); + free(next); + extent_tree->count--; + } + } + +release_add: + + /* done modifying the tree */ + extent_tree_unlock(extent_tree); + + return rc; +} + +/* search tree for entry that overlaps with given start/end + * offsets, return first overlapping entry if found, NULL otherwise, + * assumes caller has lock on tree */ +struct extent_tree_node* extent_tree_find( + struct extent_tree* 
extent_tree, /* tree to search */
+    unsigned long start, /* starting offset to search */
+    unsigned long end)   /* ending offset to search */
+{
+    /* Create a range of just our starting byte offset */
+    struct extent_tree_node* node = extent_tree_node_alloc(
+        start, start, 0, 0, 0, 0);
+    if (!node) {
+        return NULL;
+    }
+
+    /* search tree for either a range that overlaps with
+     * the target range (starting byte), or otherwise the
+     * node for the next biggest starting byte */
+    struct extent_tree_node* next = RB_NFIND(
+        inttree, &extent_tree->head, node);
+
+    free(node);
+
+    /* we may have found a node that doesn't include our starting
+     * byte offset, but it would be the range with the lowest
+     * starting offset after the target starting offset, check whether
+     * this overlaps our end offset */
+    if (next && next->start <= end) {
+        return next;
+    }
+
+    /* otherwise, there is no element that overlaps with the
+     * target range of [start, end] */
+    return NULL;
+}
+
+/* truncate extents to use new maximum, discards extent entries
+ * that exceed the new truncated size, and rewrites any entry
+ * that overlaps */
+int extent_tree_truncate(
+    struct extent_tree* tree, /* tree to truncate */
+    unsigned long size)       /* size to truncate extents to */
+{
+    if (0 == size) {
+        extent_tree_clear(tree);
+        return 0;
+    }
+
+    /* lock the tree for writing, since we may modify it */
+    extent_tree_wrlock(tree);
+
+    /* lookup node with the extent that has the maximum offset */
+    struct extent_tree_node* node = RB_MAX(inttree, &tree->head);
+
+    /* iterate backwards until we find an extent below
+     * the truncated size */
+    while (node != NULL && node->end >= size) {
+        /* found an extent whose ending offset is equal to or
+         * extends beyond the truncated size,
+         * check whether the full extent is beyond the truncated
+         * size or whether the new size falls within this extent */
+        if (node->start >= size) {
+            /* the start offset is also beyond the truncated size,
+             * meaning the entire range is beyond the truncated size,
+             * get pointer to next previous extent in tree */
+            struct extent_tree_node* oldnode = node;
+            node = RB_PREV(inttree, &tree->head, node);
+
+            /* remove this node from the tree and release it */
+            LOGDBG("removing node [%lu, %lu] due to truncate=%lu",
+                oldnode->start, oldnode->end, size);
+            RB_REMOVE(inttree, &tree->head, oldnode);
+            free(oldnode);
+
+            /* decrement the number of extents in the tree */
+            tree->count--;
+        } else {
+            /* the range of this node overlaps with the truncated size
+             * so just update its end to be the new size */
+            node->end = size - 1;
+            break;
+        }
+    }
+
+    /* update maximum offset in tree */
+    if (node != NULL) {
+        /* got at least one extent left, update maximum field */
+        tree->max = node->end;
+    } else {
+        /* no extents left in the tree, set max back to 0 */
+        tree->max = 0;
+    }
+
+    /* done modifying the tree */
+    extent_tree_unlock(tree);
+
+    return 0;
+}
+
+/*
+ * Given a range tree and a starting node, iterate through all the nodes
+ * in the tree, returning the next one each time.  If start is NULL, then
+ * start with the first node in the tree.
+ * + * This is meant to be called in a loop, like: + * + * extent_tree_rdlock(extent_tree); + * + * struct extent_tree_node *node = NULL; + * while ((node = extent_tree_iter(extent_tree, node))) { + * printf("[%d-%d]", node->start, node->end); + * } + * + * extent_tree_unlock(extent_tree); + * + * Note: this function does no locking, and assumes you're properly locking + * and unlocking the extent_tree before doing the iteration (see + * extent_tree_rdlock()/extent_tree_wrlock()/extent_tree_unlock()). + */ +struct extent_tree_node* extent_tree_iter( + struct extent_tree* extent_tree, + struct extent_tree_node* start) +{ + struct extent_tree_node* next = NULL; + if (start == NULL) { + /* Initial case, no starting node */ + next = RB_MIN(inttree, &extent_tree->head); + return next; + } + + /* + * We were given a valid start node. Look it up to start our traversal + * from there. + */ + next = RB_FIND(inttree, &extent_tree->head, start); + if (!next) { + /* Some kind of error */ + return NULL; + } + + /* Look up our next node */ + next = RB_NEXT(inttree, &extent_tree->head, start); + + return next; +} + +/* + * Lock a extent_tree for reading. This should only be used for calling + * extent_tree_iter(). All the other extent_tree functions provide their + * own locking. + */ +void extent_tree_rdlock(struct extent_tree* extent_tree) +{ + int rc = pthread_rwlock_rdlock(&extent_tree->rwlock); + if (rc) { + LOGERR("pthread_rwlock_rdlock() failed - rc=%d", rc); + } +} + +/* + * Lock a extent_tree for read/write. This should only be used for calling + * extent_tree_iter(). All the other extent_tree functions provide their + * own locking. + */ +void extent_tree_wrlock(struct extent_tree* extent_tree) +{ + int rc = pthread_rwlock_wrlock(&extent_tree->rwlock); + if (rc) { + LOGERR("pthread_rwlock_wrlock() failed - rc=%d", rc); + } +} + +/* + * Unlock a extent_tree for read/write. This should only be used for calling + * extent_tree_iter(). All the other extent_tree functions provide their + * own locking. + */ +void extent_tree_unlock(struct extent_tree* extent_tree) +{ + int rc = pthread_rwlock_unlock(&extent_tree->rwlock); + if (rc) { + LOGERR("pthread_rwlock_unlock() failed - rc=%d", rc); + } +} + +/* + * Remove all nodes in extent_tree, but keep it initialized so you can + * extent_tree_add() to it. 
+ */ +void extent_tree_clear(struct extent_tree* extent_tree) +{ + struct extent_tree_node* node = NULL; + struct extent_tree_node* oldnode = NULL; + + extent_tree_wrlock(extent_tree); + + if (RB_EMPTY(&extent_tree->head)) { + /* extent_tree is empty, nothing to do */ + extent_tree_unlock(extent_tree); + return; + } + + /* Remove and free each node in the tree */ + while ((node = extent_tree_iter(extent_tree, node))) { + if (oldnode) { + RB_REMOVE(inttree, &extent_tree->head, oldnode); + free(oldnode); + } + oldnode = node; + } + if (oldnode) { + RB_REMOVE(inttree, &extent_tree->head, oldnode); + free(oldnode); + } + + extent_tree->count = 0; + extent_tree->max = 0; + extent_tree_unlock(extent_tree); +} + +/* Return the number of segments in the segment tree */ +unsigned long extent_tree_count(struct extent_tree* extent_tree) +{ + extent_tree_rdlock(extent_tree); + unsigned long count = extent_tree->count; + extent_tree_unlock(extent_tree); + return count; +} + +/* Return the maximum ending logical offset in the tree */ +unsigned long extent_tree_max_offset(struct extent_tree* extent_tree) +{ + extent_tree_rdlock(extent_tree); + unsigned long max = extent_tree->max; + extent_tree_unlock(extent_tree); + return max; +} + +/* given an extent tree and starting and ending logical offsets, + * fill in key/value entries that overlap that range, returns at + * most max entries starting from lowest starting offset, + * sets outnum with actual number of entries returned */ +int extent_tree_span( + struct extent_tree* extent_tree, /* extent tree to search */ + int gfid, /* global file id we're looking in */ + unsigned long start, /* starting logical offset */ + unsigned long end, /* ending logical offset */ + int max, /* maximum number of key/vals to return */ + void* _keys, /* array of length max for output keys */ + void* _vals, /* array of length max for output values */ + int* outnum) /* number of entries returned */ +{ + unifyfs_key_t* keys = (unifyfs_key_t*) _keys; + unifyfs_val_t* vals = (unifyfs_val_t*) _vals; + + /* initialize output parameters */ + *outnum = 0; + + /* lock the tree for reading */ + extent_tree_rdlock(extent_tree); + + int count = 0; + struct extent_tree_node* next = extent_tree_find(extent_tree, start, end); + while (next != NULL && + next->start <= end && + count < max) { + /* got an entry that overlaps with given span */ + + /* fill in key */ + unifyfs_key_t* key = &keys[count]; + key->gfid = gfid; + key->offset = next->start; + + /* fill in value */ + unifyfs_val_t* val = &vals[count]; + val->addr = next->pos; + val->len = next->end - next->start + 1; + val->delegator_rank = next->svr_rank; + val->app_id = next->app_id; + val->rank = next->cli_id; + + /* increment the number of key/values we found */ + count++; + + /* get the next element in the tree */ + next = extent_tree_iter(extent_tree, next); + } + + /* return to user the number of key/values we set */ + *outnum = count; + + /* done reading the tree */ + extent_tree_unlock(extent_tree); + + return 0; +} + +static void chunk_req_from_extent( + unsigned long req_offset, + unsigned long req_len, + struct extent_tree_node* n, + chunk_read_req_t* chunk) +{ + unsigned long offset = n->start; + unsigned long nbytes = n->end - n->start + 1; + unsigned long log_offset = n->pos; + unsigned long last = req_offset + req_len - 1; + + if (offset < req_offset) { + unsigned long diff = req_offset - offset; + + offset = req_offset; + log_offset += diff; + nbytes -= diff; + } + + if (n->end > last) { + unsigned long diff = n->end - 
last; + nbytes -= diff; + } + + chunk->offset = offset; + chunk->nbytes = nbytes; + chunk->log_offset = log_offset; + chunk->rank = n->svr_rank; + chunk->log_client_id = n->cli_id; + chunk->log_app_id = n->app_id; +} + +int extent_tree_get_chunk_list( + struct extent_tree* extent_tree, /* extent tree to search */ + unsigned long offset, /* starting logical offset */ + unsigned long len, /* length of extent */ + unsigned int* n_chunks, /* [out] number of extents returned */ + chunk_read_req_t** chunks) /* [out] extent array */ +{ + int ret = 0; + unsigned int count = 0; + unsigned long end = offset + len - 1; + struct extent_tree_node* first = NULL; + struct extent_tree_node* next = NULL; + chunk_read_req_t* out_chunks = NULL; + chunk_read_req_t* current = NULL; + + extent_tree_rdlock(extent_tree); + + first = extent_tree_find(extent_tree, offset, end); + next = first; + while (next && next->start <= end) { + count++; + next = extent_tree_iter(extent_tree, next); + } + + *n_chunks = count; + if (0 == count) { + goto out_unlock; + } + + out_chunks = calloc(count, sizeof(*out_chunks)); + if (!out_chunks) { + ret = ENOMEM; + goto out_unlock; + } + + next = first; + current = out_chunks; + while (next && next->start <= end) { + /* trim out the extent so it does not include the data that is not + * requested */ + chunk_req_from_extent(offset, len, next, current); + + next = extent_tree_iter(extent_tree, next); + current += 1; + } + + *chunks = out_chunks; + +out_unlock: + extent_tree_unlock(extent_tree); + + return ret; +} + diff --git a/server/src/extent_tree.h b/server/src/extent_tree.h new file mode 100644 index 000000000..4c239784f --- /dev/null +++ b/server/src/extent_tree.h @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#ifndef __EXTENT_TREE_H__ +#define __EXTENT_TREE_H__ + +#include + +#include "tree.h" +#include "unifyfs_global.h" + +struct extent_tree_node { + RB_ENTRY(extent_tree_node) entry; + unsigned long start; /* starting logical offset of range */ + unsigned long end; /* ending logical offset of range */ + int svr_rank; /* rank of server hosting data */ + int app_id; /* application id (namespace) on server rank */ + int cli_id; /* client rank on server rank */ + unsigned long pos; /* physical offset of data in log */ +}; + +struct extent_tree { + RB_HEAD(inttree, extent_tree_node) head; + pthread_rwlock_t rwlock; + unsigned long count; /* number of segments stored in tree */ + unsigned long max; /* maximum logical offset value in the tree */ +}; + +/* Returns 0 on success, positive non-zero error code otherwise */ +int extent_tree_init(struct extent_tree* extent_tree); + +/* + * Remove all nodes in extent_tree, but keep it initialized so you can + * extent_tree_add() to it. + */ +void extent_tree_clear(struct extent_tree* extent_tree); + +/* + * Remove and free all nodes in the extent_tree. + */ +void extent_tree_destroy(struct extent_tree* extent_tree); + +/* + * Add an entry to the range tree. Returns 0 on success, nonzero otherwise. 
+ */ +int extent_tree_add( + struct extent_tree* extent_tree, /* tree to add new extent item */ + unsigned long start, /* logical starting offset of extent */ + unsigned long end, /* logical ending offset of extent */ + int svr_rank, /* rank of server hosting data */ + int app_id, /* application id (namespace) on server rank */ + int cli_id, /* client rank on server rank */ + unsigned long pos /* physical offset of data in log */ +); + +/* search tree for entry that overlaps with given start/end + * offsets, return first overlapping entry if found, NULL otherwise, + * assumes caller has lock on tree */ +struct extent_tree_node* extent_tree_find( + struct extent_tree* extent_tree, /* tree to search */ + unsigned long start, /* starting offset to search */ + unsigned long end /* ending offset to search */ +); + +/* truncate extents to use new maximum, discards extent entries + * that exceed the new truncated size, and rewrites any entry + * that overlaps */ +int extent_tree_truncate( + struct extent_tree* extent_tree, /* tree to truncate */ + unsigned long size /* size to truncate extents to */ +); + +/* + * Given a range tree and a starting node, iterate though all the nodes + * in the tree, returning the next one each time. If start is NULL, then + * start with the first node in the tree. + * + * This is meant to be called in a loop, like: + * + * extent_tree_rdlock(extent_tree); + * + * struct extent_tree_node *node = NULL; + * while ((node = extent_tree_iter(extent_tree, node))) { + * printf("[%d-%d]", node->start, node->end); + * } + * + * extent_tree_unlock(extent_tree); + * + * Note: this function does no locking, and assumes you're properly locking + * and unlocking the extent_tree before doing the iteration (see + * extent_tree_rdlock()/extent_tree_wrlock()/extent_tree_unlock()). + */ +struct extent_tree_node* extent_tree_iter( + struct extent_tree* extent_tree, + struct extent_tree_node* start); + +/* Return the number of segments in the segment tree */ +unsigned long extent_tree_count(struct extent_tree* extent_tree); + +/* Return the maximum ending logical offset in the tree */ +unsigned long extent_tree_max_offset(struct extent_tree* extent_tree); + +/* + * Locking functions for use with extent_tree_iter(). They allow you to + * lock the tree to iterate over it: + * + * extent_tree_rdlock(&extent_tree); + * + * struct extent_tree_node *node = NULL; + * while ((node = extent_tree_iter(extent_tree, node))) { + * printf("[%d-%d]", node->start, node->end); + * } + * + * extent_tree_unlock(&extent_tree); + */ + +/* + * Lock a extent_tree for reading. This should only be used for calling + * extent_tree_iter(). All the other extent_tree functions provide their + * own locking. + */ +void extent_tree_rdlock(struct extent_tree* extent_tree); + +/* + * Lock a extent_tree for read/write. This should only be used for calling + * extent_tree_iter(). All the other extent_tree functions provide their + * own locking. + */ +void extent_tree_wrlock(struct extent_tree* extent_tree); + +/* + * Unlock a extent_tree for read/write. This should only be used for calling + * extent_tree_iter(). All the other extent_tree functions provide their + * own locking. 
+ */ +void extent_tree_unlock(struct extent_tree* extent_tree); + +/* given an extent tree and starting and ending logical offsets, + * fill in key/value entries that overlap that range, returns at + * most max entries starting from lowest starting offset, + * sets outnum with actual number of entries returned */ +int extent_tree_span( + struct extent_tree* extent_tree, /* extent tree to search */ + int gfid, /* global file id we're looking in */ + unsigned long start, /* starting logical offset */ + unsigned long end, /* ending logical offset */ + int max, /* maximum number of key/vals to return */ + void* keys, /* array of length max for output keys */ + void* vals, /* array of length max for output values */ + int* outnum); /* number of entries returned */ + +int extent_tree_get_chunk_list( + struct extent_tree* extent_tree, /* extent tree to search */ + unsigned long offset, /* starting logical offset */ + unsigned long len, /* length of extent */ + unsigned int* n_chunks, /* [out] number of chunks returned */ + chunk_read_req_t** chunks); /* [out] extent array */ + +/* dump method for debugging extent trees */ +static inline +void extent_tree_dump(struct extent_tree* extent_tree) +{ + if (NULL == extent_tree) { + return; + } + + extent_tree_rdlock(extent_tree); + + struct extent_tree_node* node = NULL; + while ((node = extent_tree_iter(extent_tree, node))) { + LOGDBG("[%lu-%lu]", node->start, node->end); + } + + extent_tree_unlock(extent_tree); +} + +#endif /* __EXTENT_TREE_H__ */ diff --git a/server/src/margo_server.c b/server/src/margo_server.c index 96d9ebdcd..36e74cbfc 100644 --- a/server/src/margo_server.c +++ b/server/src/margo_server.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2019, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017-2019, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. 
@@ -21,15 +21,31 @@ // server headers #include "unifyfs_global.h" #include "margo_server.h" +#include "na_config.h" // from mercury include lib // global variables ServerRpcContext_t* unifyfsd_rpc_context; bool margo_use_tcp = true; bool margo_lazy_connect; // = false +int margo_client_server_pool_sz = 4; +int margo_server_server_pool_sz = 4; +int margo_use_progress_thread = 1; +#if defined(NA_HAS_SM) static const char* PROTOCOL_MARGO_SHM = "na+sm://"; -static const char* PROTOCOL_MARGO_VERBS = "ofi+verbs://"; -static const char* PROTOCOL_MARGO_TCP = "bmi+tcp://"; +#else +#error Required Mercury NA shared memory plugin not found (please enable 'SM') +#endif + +#if defined(NA_HAS_BMI) +static const char* PROTOCOL_MARGO_TCP = "bmi+tcp://"; +static const char* PROTOCOL_MARGO_RMA = "bmi+tcp://"; +#elif defined(NA_HAS_OFI) +static const char* PROTOCOL_MARGO_TCP = "ofi+tcp://"; +static const char* PROTOCOL_MARGO_RMA = "ofi+verbs://"; +#else +#error No supported Mercury NA plugin found (please use one of: 'BMI', 'OFI') +#endif /* setup_remote_target - Initializes the server-server margo target */ static margo_instance_id setup_remote_target(void) @@ -45,10 +61,11 @@ static margo_instance_id setup_remote_target(void) if (margo_use_tcp) { margo_protocol = PROTOCOL_MARGO_TCP; } else { - margo_protocol = PROTOCOL_MARGO_VERBS; + margo_protocol = PROTOCOL_MARGO_RMA; } - mid = margo_init(margo_protocol, MARGO_SERVER_MODE, 1, 4); + mid = margo_init(margo_protocol, MARGO_SERVER_MODE, + margo_use_progress_thread, margo_server_server_pool_sz); if (mid == MARGO_INSTANCE_NULL) { LOGERR("margo_init(%s)", margo_protocol); return mid; @@ -82,15 +99,10 @@ static margo_instance_id setup_remote_target(void) /* register server-server RPCs */ static void register_server_server_rpcs(margo_instance_id mid) { - unifyfsd_rpc_context->rpcs.hello_id = - MARGO_REGISTER(mid, "server_hello_rpc", - server_hello_in_t, server_hello_out_t, - server_hello_rpc); - - unifyfsd_rpc_context->rpcs.request_id = - MARGO_REGISTER(mid, "server_request_rpc", - server_request_in_t, server_request_out_t, - server_request_rpc); + unifyfsd_rpc_context->rpcs.server_pid_id = + MARGO_REGISTER(mid, "server_pid_rpc", + server_pid_in_t, server_pid_out_t, + server_pid_rpc); unifyfsd_rpc_context->rpcs.chunk_read_request_id = MARGO_REGISTER(mid, "chunk_read_request_rpc", @@ -101,6 +113,66 @@ static void register_server_server_rpcs(margo_instance_id mid) MARGO_REGISTER(mid, "chunk_read_response_rpc", chunk_read_response_in_t, chunk_read_response_out_t, chunk_read_response_rpc); + + unifyfsd_rpc_context->rpcs.extent_add_id = + MARGO_REGISTER(mid, "add_extents_rpc", + add_extents_in_t, add_extents_out_t, + add_extents_rpc); + + unifyfsd_rpc_context->rpcs.extent_bcast_id = + MARGO_REGISTER(mid, "extent_bcast_rpc", + extent_bcast_in_t, extent_bcast_out_t, + extent_bcast_rpc); + + unifyfsd_rpc_context->rpcs.extent_lookup_id = + MARGO_REGISTER(mid, "find_extents_rpc", + find_extents_in_t, find_extents_out_t, + find_extents_rpc); + + unifyfsd_rpc_context->rpcs.fileattr_bcast_id = + MARGO_REGISTER(mid, "fileattr_bcast_rpc", + fileattr_bcast_in_t, fileattr_bcast_out_t, + fileattr_bcast_rpc); + + unifyfsd_rpc_context->rpcs.filesize_id = + MARGO_REGISTER(mid, "filesize_rpc", + filesize_in_t, filesize_out_t, + filesize_rpc); + + unifyfsd_rpc_context->rpcs.laminate_id = + MARGO_REGISTER(mid, "laminate_rpc", + laminate_in_t, laminate_out_t, + laminate_rpc); + + unifyfsd_rpc_context->rpcs.laminate_bcast_id = + MARGO_REGISTER(mid, "laminate_bcast_rpc", + 
laminate_bcast_in_t, laminate_bcast_out_t, + laminate_bcast_rpc); + + unifyfsd_rpc_context->rpcs.metaget_id = + MARGO_REGISTER(mid, "metaget_rpc", + metaget_in_t, metaget_out_t, + metaget_rpc); + + unifyfsd_rpc_context->rpcs.metaset_id = + MARGO_REGISTER(mid, "metaset_rpc", + metaset_in_t, metaset_out_t, + metaset_rpc); + + unifyfsd_rpc_context->rpcs.truncate_id = + MARGO_REGISTER(mid, "truncate_rpc", + truncate_in_t, truncate_out_t, + truncate_rpc); + + unifyfsd_rpc_context->rpcs.truncate_bcast_id = + MARGO_REGISTER(mid, "truncate_bcast_rpc", + truncate_bcast_in_t, truncate_bcast_out_t, + truncate_bcast_rpc); + + unifyfsd_rpc_context->rpcs.unlink_bcast_id = + MARGO_REGISTER(mid, "unlink_bcast_rpc", + unlink_bcast_in_t, unlink_bcast_out_t, + unlink_bcast_rpc); } /* setup_local_target - Initializes the client-server margo target */ @@ -113,7 +185,8 @@ static margo_instance_id setup_local_target(void) hg_size_t self_string_sz = sizeof(self_string); margo_instance_id mid; - mid = margo_init(PROTOCOL_MARGO_SHM, MARGO_SERVER_MODE, 1, -1); + mid = margo_init(PROTOCOL_MARGO_SHM, MARGO_SERVER_MODE, + margo_use_progress_thread, margo_client_server_pool_sz); if (mid == MARGO_INSTANCE_NULL) { LOGERR("margo_init(%s)", PROTOCOL_MARGO_SHM); return mid; @@ -147,6 +220,10 @@ static margo_instance_id setup_local_target(void) /* register client-server RPCs */ static void register_client_server_rpcs(margo_instance_id mid) { + MARGO_REGISTER(mid, "unifyfs_attach_rpc", + unifyfs_attach_in_t, unifyfs_attach_out_t, + unifyfs_attach_rpc); + MARGO_REGISTER(mid, "unifyfs_mount_rpc", unifyfs_mount_in_t, unifyfs_mount_out_t, unifyfs_mount_rpc); @@ -171,6 +248,18 @@ static void register_client_server_rpcs(margo_instance_id mid) unifyfs_filesize_in_t, unifyfs_filesize_out_t, unifyfs_filesize_rpc); + MARGO_REGISTER(mid, "unifyfs_truncate_rpc", + unifyfs_truncate_in_t, unifyfs_truncate_out_t, + unifyfs_truncate_rpc); + + MARGO_REGISTER(mid, "unifyfs_unlink_rpc", + unifyfs_unlink_in_t, unifyfs_unlink_out_t, + unifyfs_unlink_rpc); + + MARGO_REGISTER(mid, "unifyfs_laminate_rpc", + unifyfs_laminate_in_t, unifyfs_laminate_out_t, + unifyfs_laminate_rpc); + MARGO_REGISTER(mid, "unifyfs_read_rpc", unifyfs_read_in_t, unifyfs_read_out_t, unifyfs_read_rpc) @@ -193,13 +282,15 @@ int margo_server_rpc_init(void) if (NULL == unifyfsd_rpc_context) { /* create rpc server context */ unifyfsd_rpc_context = calloc(1, sizeof(ServerRpcContext_t)); - assert(unifyfsd_rpc_context); + if (NULL == unifyfsd_rpc_context) { + return ENOMEM; + } } margo_instance_id mid; mid = setup_local_target(); if (mid == MARGO_INSTANCE_NULL) { - rc = UNIFYFS_FAILURE; + rc = UNIFYFS_ERROR_MARGO; } else { unifyfsd_rpc_context->shm_mid = mid; register_client_server_rpcs(mid); @@ -207,7 +298,7 @@ int margo_server_rpc_init(void) mid = setup_remote_target(); if (mid == MARGO_INSTANCE_NULL) { - rc = UNIFYFS_FAILURE; + rc = UNIFYFS_ERROR_MARGO; } else { unifyfsd_rpc_context->svr_mid = mid; register_server_server_rpcs(mid); @@ -233,6 +324,18 @@ int margo_server_rpc_finalize(void) rpc_clean_local_server_addr(); + /* free global server addresses */ + for (int i = 0; i < glb_num_servers; i++) { + if (glb_servers[i].margo_svr_addr != HG_ADDR_NULL) { + margo_addr_free(ctx->svr_mid, glb_servers[i].margo_svr_addr); + glb_servers[i].margo_svr_addr = HG_ADDR_NULL; + } + if (NULL != glb_servers[i].margo_svr_addr_str) { + free(glb_servers[i].margo_svr_addr_str); + glb_servers[i].margo_svr_addr_str = NULL; + } + } + /* shut down margo */ margo_finalize(ctx->svr_mid); /* NOTE: 2nd call 
to margo_finalize() sometimes crashes - Margo bug? */ @@ -257,42 +360,51 @@ int margo_connect_servers(void) size_t i; hg_return_t hret; + // block until a margo_svr key pair published by all servers + rc = unifyfs_keyval_fence_remote(); + if ((int)UNIFYFS_SUCCESS != rc) { + LOGERR("keyval fence on margo_svr key failed"); + ret = (int)UNIFYFS_FAILURE; + return ret; + } + for (i = 0; i < glb_num_servers; i++) { - int remote_mpi_rank = -1; - char* mpi_rank_str = NULL; + int remote_pmi_rank = -1; + char* pmi_rank_str = NULL; char* margo_addr_str = NULL; - // NOTE: this really doesn't belong here, and will eventually go away - rc = unifyfs_keyval_lookup_remote(i, key_unifyfsd_mpi_rank, - &mpi_rank_str); - if ((int)UNIFYFS_SUCCESS == rc) { - remote_mpi_rank = atoi(mpi_rank_str); - free(mpi_rank_str); - } else { - LOGERR("server index=%zu - MPI rank lookup failed", i); + rc = unifyfs_keyval_lookup_remote(i, key_unifyfsd_pmi_rank, + &pmi_rank_str); + if ((int)UNIFYFS_SUCCESS != rc) { + LOGERR("server index=%zu - pmi rank lookup failed", i); ret = (int)UNIFYFS_FAILURE; + return ret; + } + if (NULL != pmi_rank_str) { + remote_pmi_rank = atoi(pmi_rank_str); + free(pmi_rank_str); } - glb_servers[i].mpi_rank = remote_mpi_rank; + glb_servers[i].pmi_rank = remote_pmi_rank; margo_addr_str = rpc_lookup_remote_server_addr(i); + if (NULL == margo_addr_str) { + LOGERR("server index=%zu - margo server lookup failed", i); + ret = (int)UNIFYFS_FAILURE; + return ret; + } + glb_servers[i].margo_svr_addr = HG_ADDR_NULL; glb_servers[i].margo_svr_addr_str = margo_addr_str; - if (NULL != margo_addr_str) { - LOGDBG("server index=%zu, mpi_rank=%d, margo_addr=%s", - i, remote_mpi_rank, margo_addr_str); - if (!margo_lazy_connect) { - glb_servers[i].margo_svr_addr = HG_ADDR_NULL; - hret = margo_addr_lookup(unifyfsd_rpc_context->svr_mid, - glb_servers[i].margo_svr_addr_str, - &(glb_servers[i].margo_svr_addr)); - if (hret != HG_SUCCESS) { - LOGERR("server index=%zu - margo_addr_lookup(%s) failed", - i, margo_addr_str); - ret = (int)UNIFYFS_FAILURE; - } + LOGDBG("server index=%zu, pmi_rank=%d, margo_addr=%s", + i, remote_pmi_rank, margo_addr_str); + if (!margo_lazy_connect) { + hret = margo_addr_lookup(unifyfsd_rpc_context->svr_mid, + glb_servers[i].margo_svr_addr_str, + &(glb_servers[i].margo_svr_addr)); + if (hret != HG_SUCCESS) { + LOGERR("server index=%zu - margo_addr_lookup(%s) failed", + i, margo_addr_str); + ret = (int)UNIFYFS_FAILURE; } - } else { - LOGERR("server index=%zu - margo addr string lookup failed", i); - ret = (int)UNIFYFS_FAILURE; } } diff --git a/server/src/margo_server.h b/server/src/margo_server.h index 2a9f1a298..d5ba79c5c 100644 --- a/server/src/margo_server.h +++ b/server/src/margo_server.h @@ -1,3 +1,17 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
+ */ + #ifndef _MARGO_SERVER_H #define _MARGO_SERVER_H @@ -16,10 +30,21 @@ #include typedef struct ServerRpcIds { - hg_id_t hello_id; - hg_id_t request_id; hg_id_t chunk_read_request_id; hg_id_t chunk_read_response_id; + hg_id_t extent_add_id; + hg_id_t extent_bcast_id; + hg_id_t extent_lookup_id; + hg_id_t filesize_id; + hg_id_t laminate_id; + hg_id_t laminate_bcast_id; + hg_id_t metaget_id; + hg_id_t metaset_id; + hg_id_t fileattr_bcast_id; + hg_id_t server_pid_id; + hg_id_t truncate_id; + hg_id_t truncate_bcast_id; + hg_id_t unlink_bcast_id; } server_rpcs_t; typedef struct ServerRpcContext { diff --git a/server/src/unifyfs_cmd_handler.c b/server/src/unifyfs_cmd_handler.c index 5df7f5a78..164df9d1a 100644 --- a/server/src/unifyfs_cmd_handler.c +++ b/server/src/unifyfs_cmd_handler.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017-2019, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -30,144 +30,17 @@ // system headers #include #include -#include // server components #include "unifyfs_global.h" -#include "unifyfs_metadata.h" +#include "unifyfs_metadata_mdhim.h" #include "unifyfs_request_manager.h" // margo rpcs #include "margo_server.h" #include "unifyfs_client_rpcs.h" #include "unifyfs_rpc_util.h" - -/** - * attach to the client-side shared memory - * @param app_config: application information - * @param app_id: the server-side - * @param sock_id: position in poll_set in unifyfs_sock.h - * @return success/error code - */ -static int attach_to_shm(app_config_t* app_config, - int app_id, - int client_side_id) -{ - char shm_name[GEN_STR_LEN] = {0}; - - /* attach shared superblock, a superblock is created by each - * client to store the raw file data. - * The overflowed data are spilled to SSD. 
*/ - - /* define name of superblock region for this client */ - sprintf(shm_name, "%d-super-%d", app_id, client_side_id); - - /* attach to superblock */ - void* addr = unifyfs_shm_alloc(shm_name, app_config->superblock_sz); - if (addr == NULL) { - LOGERR("Failed to attach to superblock %s", shm_name); - return (int)UNIFYFS_ERROR_SHMEM; - } - app_config->shm_superblocks[client_side_id] = addr; - - /* copy name of superblock region */ - strcpy(app_config->super_buf_name[client_side_id], shm_name); - - /* attach shared request buffer, a request buffer is created by each - * client to convey the client-side read request to the delegator */ - - /* define name of request buffer region for this client */ - sprintf(shm_name, "%d-req-%d", app_id, client_side_id); - - /* attach to request buffer region */ - addr = unifyfs_shm_alloc(shm_name, app_config->req_buf_sz); - if (addr == NULL) { - LOGERR("Failed to attach to request buffer %s", shm_name); - return (int)UNIFYFS_ERROR_SHMEM; - } - app_config->shm_req_bufs[client_side_id] = addr; - - /* copy name of request buffer region */ - strcpy(app_config->req_buf_name[client_side_id], shm_name); - - /* initialize shared receive buffer, a request buffer is created - * by each client for the delegator to temporarily buffer the - * received data for this client */ - - /* define name of receive buffer region for this client */ - sprintf(shm_name, "%d-recv-%d", app_id, client_side_id); - - /* attach to request buffer region */ - addr = unifyfs_shm_alloc(shm_name, app_config->recv_buf_sz); - if (addr == NULL) { - LOGERR("Failed to attach to receive buffer %s", shm_name); - return (int)UNIFYFS_ERROR_SHMEM; - } - app_config->shm_recv_bufs[client_side_id] = addr; - shm_header_t* shm_hdr = (shm_header_t*)addr; - pthread_mutex_init(&(shm_hdr->sync), NULL); - shm_hdr->meta_cnt = 0; - shm_hdr->bytes = 0; - shm_hdr->state = SHMEM_REGION_EMPTY; - - /* copy name of request buffer region */ - strcpy(app_config->recv_buf_name[client_side_id], shm_name); - - return UNIFYFS_SUCCESS; -} - -/** - * open spilled log file, spilled log file - * is created once the client-side shared superblock - * overflows. 
- * @param app_config: application information - * @param app_id: the server-side application id - * @param sock_id: position in poll_set in unifyfs_sock.h - * @return success/error code - */ -static int open_log_file(app_config_t* app_config, - int app_id, int client_side_id) -{ - /* build name to spill over log file, - * have one of these per app_id and client_id, - * client writes data to spill over file when it fills - * memory storage */ - char path[UNIFYFS_MAX_FILENAME] = {0}; - snprintf(path, sizeof(path), "%s/spill_%d_%d.log", - app_config->external_spill_dir, app_id, client_side_id); - - /* copy filename of spill over file into app_config */ - strcpy(app_config->spill_log_name[client_side_id], path); - - /* open spill over file for reading */ - app_config->spill_log_fds[client_side_id] = open(path, O_RDONLY, 0666); - if (app_config->spill_log_fds[client_side_id] < 0) { - printf("rank:%d, opening file %s failure\n", glb_mpi_rank, path); - fflush(stdout); - return (int)UNIFYFS_ERROR_FILE; - } - - /* build name of spill over index file, - * this contains index meta data for data the client wrote to the - * spill over file */ - snprintf(path, sizeof(path), "%s/spill_index_%d_%d.log", - app_config->external_spill_dir, app_id, client_side_id); - - /* copy name of spill over index metadata file to app_config */ - strcpy(app_config->spill_index_log_name[client_side_id], path); - - /* open spill over index file for reading */ - app_config->spill_index_log_fds[client_side_id] = - open(path, O_RDONLY, 0666); - if (app_config->spill_index_log_fds[client_side_id] < 0) { - printf("rank:%d, opening index file %s failure\n", glb_mpi_rank, path); - fflush(stdout); - return (int)UNIFYFS_ERROR_FILE; - } - - return UNIFYFS_SUCCESS; -} - +#include "unifyfs_misc.h" /* BEGIN MARGO CLIENT-SERVER RPC HANDLER FUNCTIONS */ @@ -185,183 +58,164 @@ static int open_log_file(app_config_t* app_config, * client */ static void unifyfs_mount_rpc(hg_handle_t handle) { - int rc; int ret = (int)UNIFYFS_SUCCESS; + int app_id = -1; + int client_id = -1; /* get input params */ unifyfs_mount_in_t in; hg_return_t hret = margo_get_input(handle, &in); - assert(hret == HG_SUCCESS); - - /* read app_id and client_id from input */ - int app_id = in.app_id; - int client_id = in.local_rank_idx; - - /* lookup app_config for given app_id */ - app_config_t* tmp_config = - (app_config_t*) arraylist_get(app_config_list, app_id); - - /* fill in and insert a new entry for this app_id - * if we don't already have one */ - if (tmp_config == NULL) { - /* don't have an app_config for this app_id, - * so allocate and fill one in */ - tmp_config = (app_config_t*)malloc(sizeof(app_config_t)); - - /* record size of shared memory regions */ - tmp_config->req_buf_sz = in.req_buf_sz; - tmp_config->recv_buf_sz = in.recv_buf_sz; - tmp_config->superblock_sz = in.superblock_sz; - - /* record offset and size of index entries */ - tmp_config->meta_offset = in.meta_offset; - tmp_config->meta_size = in.meta_size; - - /* record offset and size of file meta data entries */ - tmp_config->fmeta_offset = in.fmeta_offset; - tmp_config->fmeta_size = in.fmeta_size; - - /* record offset and size of file data */ - tmp_config->data_offset = in.data_offset; - tmp_config->data_size = in.data_size; - - /* record directory holding spill over files */ - strcpy(tmp_config->external_spill_dir, in.external_spill_dir); - - /* record number of clients on this node */ - tmp_config->num_procs_per_node = in.num_procs_per_node; - - /* initialize per-client fields */ - int i; - for 
(i = 0; i < MAX_NUM_CLIENTS; i++) { - tmp_config->client_ranks[i] = -1; - tmp_config->shm_req_bufs[i] = NULL; - tmp_config->shm_recv_bufs[i] = NULL; - tmp_config->shm_superblocks[i] = NULL; - tmp_config->spill_log_fds[i] = -1; - tmp_config->spill_index_log_fds[i] = -1; - tmp_config->client_addr[i] = HG_ADDR_NULL; + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* read app_id and client_id from input */ + app_id = unifyfs_generate_gfid(in.mount_prefix); + + /* lookup app_config for given app_id */ + app_config* app_cfg = get_application(app_id); + if (app_cfg == NULL) { + /* insert new app_config into our app_configs array */ + LOGDBG("creating new application for app_id=%d", app_id); + app_cfg = new_application(app_id); + if (NULL == app_cfg) { + ret = UNIFYFS_FAILURE; + } + } else { + LOGDBG("using existing app_config for app_id=%d", app_id); } - /* insert new app_config into our list, indexed by app_id */ - rc = arraylist_insert(app_config_list, app_id, tmp_config); - if (rc != 0) { - ret = rc; + if (NULL != app_cfg) { + LOGDBG("creating new app client for %s", in.client_addr_str); + app_client* client = new_app_client(app_cfg, + in.client_addr_str, + in.dbg_rank); + if (NULL == client) { + LOGERR("failed to create new client for app_id=%d dbg_rank=%d", + app_id, (int)in.dbg_rank); + ret = (int)UNIFYFS_FAILURE; + } else { + client_id = client->client_id; + LOGDBG("created new application client %d:%d", + app_id, client_id); + } } - } else { - LOGDBG("using existing app_config for app_id=%d", app_id); - } - - /* convert client_addr_str sent in input struct to margo hg_addr_t, - * which is the address type needed to call rpc functions, etc */ - hret = margo_addr_lookup(unifyfsd_rpc_context->shm_mid, - in.client_addr_str, - &(tmp_config->client_addr[client_id])); - - /* record client id of process on this node */ - tmp_config->client_ranks[client_id] = client_id; - - /* record global rank of client process for debugging */ - tmp_config->dbg_ranks[client_id] = in.dbg_rank; - /* attach to shared memory regions of this client */ - rc = attach_to_shm(tmp_config, app_id, client_id); - if (rc != UNIFYFS_SUCCESS) { - LOGERR("attach_to_shm() failed for app_id=%d client_id=%d rc=%d", - app_id, client_id, rc); - ret = rc; - } - - /* open spill over files for this client */ - rc = open_log_file(tmp_config, app_id, client_id); - if (rc < 0) { - LOGERR("open_log_file() failed for app_id=%d client_id=%d rc=%d", - app_id, client_id, rc); - ret = rc; - } - - /* create request manager thread */ - reqmgr_thrd_t* rm_thrd = unifyfs_rm_thrd_create(app_id, client_id); - if (rm_thrd != NULL) { - /* TODO: seems like it would be cleaner to avoid thread_list - * and instead just record address to struct */ - /* remember id for thread control for this client */ - tmp_config->thrd_idxs[client_id] = rm_thrd->thrd_ndx; - } else { - /* failed to create request manager thread */ - LOGERR("unifyfs_rm_thrd_create() failed for app_id=%d client_id=%d", - app_id, client_id); - ret = UNIFYFS_FAILURE; + margo_free_input(handle, &in); } /* build output structure to return to caller */ unifyfs_mount_out_t out; + out.app_id = (int32_t) app_id; + out.client_id = (int32_t) client_id; out.ret = ret; - out.max_recs_per_slice = max_recs_per_slice; /* send output back to caller */ hret = margo_respond(handle, &out); - assert(hret == HG_SUCCESS); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } /* free margo resources */ - margo_free_input(handle, &in); 
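/* Summary (editorial aside, not part of the change): together with the new
 * unifyfs_attach_rpc below, client startup becomes a two-step handshake.
 * Roughly:
 *
 *   1. unifyfs_mount_rpc  - the server derives app_id from the mount
 *      prefix, creates the app_config and app_client records, and returns
 *      (app_id, client_id) to the client.
 *   2. unifyfs_attach_rpc - the client sends its shared-memory region
 *      sizes and spill-over directory; the server maps those regions via
 *      attach_app_client().
 */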
margo_destroy(handle); } DEFINE_MARGO_RPC_HANDLER(unifyfs_mount_rpc) -static void unifyfs_unmount_rpc(hg_handle_t handle) +/* server attaches to client shared memory regions, opens files + * holding spillover data */ +static void unifyfs_attach_rpc(hg_handle_t handle) { + int ret = (int)UNIFYFS_SUCCESS; + /* get input params */ - unifyfs_unmount_in_t in; + unifyfs_attach_in_t in; hg_return_t hret = margo_get_input(handle, &in); - assert(hret == HG_SUCCESS); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* read app_id and client_id from input */ + int app_id = in.app_id; + int client_id = in.client_id; + + /* lookup client structure and attach it */ + app_client* client = get_app_client(app_id, client_id); + if (NULL != client) { + LOGDBG("attaching client %d:%d", app_id, client_id); + ret = attach_app_client(client, + in.logio_spill_dir, + in.logio_spill_size, + in.logio_mem_size, + in.shmem_data_size, + in.shmem_super_size, + in.meta_offset, + in.meta_size); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("attach_app_client() failed"); + } + } else { + LOGERR("client not found (app_id=%d, client_id=%d)", + app_id, client_id); + ret = (int)UNIFYFS_FAILURE; + } - /* read app_id and client_id from input */ - int app_id = in.app_id; - int client_id = in.local_rank_idx; + margo_free_input(handle, &in); + } /* build output structure to return to caller */ - unifyfs_unmount_out_t out; - out.ret = UNIFYFS_SUCCESS; + unifyfs_attach_out_t out; + out.ret = ret; /* send output back to caller */ hret = margo_respond(handle, &out); - assert(hret == HG_SUCCESS); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } /* free margo resources */ - margo_free_input(handle, &in); margo_destroy(handle); +} +DEFINE_MARGO_RPC_HANDLER(unifyfs_attach_rpc) - /* lookup app_config for given app_id */ - app_config_t* app_config = - (app_config_t*) arraylist_get(app_config_list, app_id); - - /* get thread id for this client */ - int thrd_id = app_config->thrd_idxs[client_id]; - - /* look up thread control structure */ - reqmgr_thrd_t* thrd_ctrl = rm_get_thread(thrd_id); +static void unifyfs_unmount_rpc(hg_handle_t handle) +{ + int ret = UNIFYFS_SUCCESS; - /* shutdown the delegator thread */ - rm_cmd_exit(thrd_ctrl); + /* get input params */ + unifyfs_unmount_in_t in; + hg_return_t hret = margo_get_input(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* read app_id and client_id from input */ + int app_id = in.app_id; + int client_id = in.client_id; + + /* disconnect app client */ + app_client* clnt = get_app_client(app_id, client_id); + if (NULL != clnt) { + ret = disconnect_app_client(clnt); + } else { + LOGERR("application client not found"); + ret = EINVAL; + } - /* detach from the request shared memory */ - if (NULL != app_config->shm_req_bufs[client_id]) { - unifyfs_shm_free(app_config->req_buf_name[client_id], - app_config->req_buf_sz, - (void**)&(app_config->shm_req_bufs[client_id])); + margo_free_input(handle, &in); } - /* detach from the read shared memory buffer */ - if (NULL != app_config->shm_recv_bufs[client_id]) { - unifyfs_shm_free(app_config->recv_buf_name[client_id], - app_config->recv_buf_sz, - (void**)&(app_config->shm_recv_bufs[client_id])); + /* build output structure to return to caller */ + unifyfs_unmount_out_t out; + out.ret = ret; + + /* send output back to caller */ + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + 
LOGERR("margo_respond() failed"); } - /* free margo hg_addr_t client addresses in app_config struct */ - margo_addr_free(unifyfsd_rpc_context->shm_mid, - app_config->client_addr[client_id]); + /* free margo resources */ + margo_destroy(handle); } DEFINE_MARGO_RPC_HANDLER(unifyfs_unmount_rpc) @@ -369,38 +223,59 @@ DEFINE_MARGO_RPC_HANDLER(unifyfs_unmount_rpc) * given a global file id */ static void unifyfs_metaget_rpc(hg_handle_t handle) { - /* get input params */ - unifyfs_metaget_in_t in; - hg_return_t hret = margo_get_input(handle, &in); - assert(hret == HG_SUCCESS); - - /* given the global file id, look up file attributes - * from key/value store */ - unifyfs_file_attr_t attr_val; + int ret = UNIFYFS_SUCCESS; + hg_return_t hret; - int ret = unifyfs_get_file_attribute(in.gfid, &attr_val); + /* get input params */ + unifyfs_metaget_in_t* in = malloc(sizeof(*in)); + if (NULL == in) { + ret = ENOMEM; + } else { + hret = margo_get_input(handle, in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + client_rpc_req_t* req = malloc(sizeof(client_rpc_req_t)); + if (NULL == req) { + ret = ENOMEM; + } else { + unifyfs_fops_ctx_t ctx = { + .app_id = in->app_id, + .client_id = in->client_id, + }; + req->req_type = UNIFYFS_CLIENT_RPC_METAGET; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = NULL; + req->bulk_sz = 0; + ret = rm_submit_client_rpc_request(&ctx, req); + } + + if (ret != UNIFYFS_SUCCESS) { + margo_free_input(handle, in); + } + } + } - /* build our output values */ - unifyfs_metaget_out_t out; - out.gfid = attr_val.gfid; - out.mode = attr_val.mode; - out.uid = attr_val.uid; - out.gid = attr_val.gid; - out.size = attr_val.size; - out.atime = attr_val.atime; - out.mtime = attr_val.mtime; - out.ctime = attr_val.ctime; - out.filename = attr_val.filename; - out.is_laminated = attr_val.is_laminated; - out.ret = ret; + /* if we hit an error during request submission, respond with the error */ + if (ret != UNIFYFS_SUCCESS) { + if (NULL != in) { + free(in); + } - /* send output back to caller */ - hret = margo_respond(handle, &out); - assert(hret == HG_SUCCESS); + /* return to caller */ + unifyfs_metaget_out_t out; + out.ret = (int32_t) ret; + memset(&(out.attr), 0, sizeof(out.attr)); + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } - /* free margo resources */ - margo_free_input(handle, &in); - margo_destroy(handle); + /* free margo resources */ + margo_destroy(handle); + } } DEFINE_MARGO_RPC_HANDLER(unifyfs_metaget_rpc) @@ -408,115 +283,385 @@ DEFINE_MARGO_RPC_HANDLER(unifyfs_metaget_rpc) * record key/value entry for this file */ static void unifyfs_metaset_rpc(hg_handle_t handle) { + int ret = UNIFYFS_SUCCESS; + hg_return_t hret; + /* get input params */ - unifyfs_metaset_in_t in; - hg_return_t hret = margo_get_input(handle, &in); - assert(hret == HG_SUCCESS); - - /* store file name for given global file id */ - unifyfs_file_attr_t fattr; - memset(&fattr, 0, sizeof(fattr)); - fattr.gfid = in.gfid; - strncpy(fattr.filename, in.filename, sizeof(fattr.filename)); - fattr.mode = in.mode; - fattr.uid = in.uid; - fattr.gid = in.gid; - fattr.size = in.size; - fattr.atime = in.atime; - fattr.mtime = in.mtime; - fattr.ctime = in.ctime; - fattr.is_laminated = in.is_laminated; - - int ret = unifyfs_set_file_attribute(&fattr); + unifyfs_metaset_in_t* in = malloc(sizeof(*in)); + if (NULL == in) { + ret = ENOMEM; + } else { + hret = margo_get_input(handle, in); + if (hret != 
HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + client_rpc_req_t* req = malloc(sizeof(client_rpc_req_t)); + if (NULL == req) { + ret = ENOMEM; + } else { + unifyfs_fops_ctx_t ctx = { + .app_id = in->app_id, + .client_id = in->client_id, + }; + req->req_type = UNIFYFS_CLIENT_RPC_METASET; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = NULL; + req->bulk_sz = 0; + ret = rm_submit_client_rpc_request(&ctx, req); + } + + if (ret != UNIFYFS_SUCCESS) { + margo_free_input(handle, in); + } + } + } - /* build our output values */ - unifyfs_metaset_out_t out; - out.ret = ret; + /* if we hit an error during request submission, respond with the error */ + if (ret != UNIFYFS_SUCCESS) { + if (NULL != in) { + free(in); + } - /* return to caller */ - hret = margo_respond(handle, &out); - assert(hret == HG_SUCCESS); + /* return to caller */ + unifyfs_metaset_out_t out; + out.ret = (int32_t) ret; + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } - /* free margo resources */ - margo_free_input(handle, &in); - margo_destroy(handle); + /* free margo resources */ + margo_destroy(handle); + } } DEFINE_MARGO_RPC_HANDLER(unifyfs_metaset_rpc) -/* given app_id, client_id, and a global file id as input, - * read extent location metadata from client shared memory - * and insert corresponding key/value pairs into global index */ +/* given a global file id and client identified by (app_id, client_id) as + * input, read the write extents for the file from the shared memory index + * and update its global metadata */ static void unifyfs_fsync_rpc(hg_handle_t handle) { - /* get input params */ - unifyfs_fsync_in_t in; - hg_return_t hret = margo_get_input(handle, &in); - assert(hret == HG_SUCCESS); + int ret = UNIFYFS_SUCCESS; + hg_return_t hret; - /* given global file id, read index metadata from client and - * insert into global index key/value store */ - int ret = rm_cmd_fsync(in.app_id, in.local_rank_idx, in.gfid); + /* get input params */ + unifyfs_fsync_in_t* in = malloc(sizeof(*in)); + if (NULL == in) { + ret = ENOMEM; + } else { + hret = margo_get_input(handle, in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + client_rpc_req_t* req = malloc(sizeof(client_rpc_req_t)); + if (NULL == req) { + ret = ENOMEM; + } else { + unifyfs_fops_ctx_t ctx = { + .app_id = in->app_id, + .client_id = in->client_id, + }; + req->req_type = UNIFYFS_CLIENT_RPC_SYNC; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = NULL; + req->bulk_sz = 0; + ret = rm_submit_client_rpc_request(&ctx, req); + } + + if (ret != UNIFYFS_SUCCESS) { + margo_free_input(handle, in); + } + } + } - /* build our output values */ - unifyfs_metaset_out_t out; - out.ret = ret; + /* if we hit an error during request submission, respond with the error */ + if (ret != UNIFYFS_SUCCESS) { + if (NULL != in) { + free(in); + } - /* return to caller */ - hret = margo_respond(handle, &out); - assert(hret == HG_SUCCESS); + /* return to caller */ + unifyfs_fsync_out_t out; + out.ret = (int32_t) ret; + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } - /* free margo resources */ - margo_free_input(handle, &in); - margo_destroy(handle); + /* free margo resources */ + margo_destroy(handle); + } } DEFINE_MARGO_RPC_HANDLER(unifyfs_fsync_rpc) - /* given an app_id, client_id, global file id, * return current file size */ static void 
unifyfs_filesize_rpc(hg_handle_t handle) { - /* get input params */ - unifyfs_filesize_in_t in; - hg_return_t hret = margo_get_input(handle, &in); - assert(hret == HG_SUCCESS); + int ret = UNIFYFS_SUCCESS; + hg_return_t hret; - /* read data for a single read request from client, - * returns data to client through shared memory */ - size_t filesize; - int ret = rm_cmd_filesize(in.app_id, in.local_rank_idx, - in.gfid, &filesize); + /* get input params */ + unifyfs_filesize_in_t* in = malloc(sizeof(*in)); + if (NULL == in) { + ret = ENOMEM; + } else { + hret = margo_get_input(handle, in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + client_rpc_req_t* req = malloc(sizeof(client_rpc_req_t)); + if (NULL == req) { + ret = ENOMEM; + } else { + unifyfs_fops_ctx_t ctx = { + .app_id = in->app_id, + .client_id = in->client_id, + }; + req->req_type = UNIFYFS_CLIENT_RPC_FILESIZE; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = NULL; + req->bulk_sz = 0; + ret = rm_submit_client_rpc_request(&ctx, req); + } + + if (ret != UNIFYFS_SUCCESS) { + margo_free_input(handle, in); + } + } + } - /* build our output values */ - unifyfs_filesize_out_t out; - out.ret = (int32_t) ret; - out.filesize = (hg_size_t) filesize; + /* if we hit an error during request submission, respond with the error */ + if (ret != UNIFYFS_SUCCESS) { + if (NULL != in) { + free(in); + } - /* return to caller */ - hret = margo_respond(handle, &out); - assert(hret == HG_SUCCESS); + /* return to caller */ + unifyfs_filesize_out_t out; + out.ret = (int32_t) ret; + out.filesize = (hg_size_t) 0; + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } - /* free margo resources */ - margo_free_input(handle, &in); - margo_destroy(handle); + /* free margo resources */ + margo_destroy(handle); + } } DEFINE_MARGO_RPC_HANDLER(unifyfs_filesize_rpc) +/* given an app_id, client_id, global file id, + * and file size, truncate file to that size */ +static void unifyfs_truncate_rpc(hg_handle_t handle) +{ + int ret = UNIFYFS_SUCCESS; + hg_return_t hret; + + /* get input params */ + unifyfs_truncate_in_t* in = malloc(sizeof(*in)); + if (NULL == in) { + ret = ENOMEM; + } else { + hret = margo_get_input(handle, in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + client_rpc_req_t* req = malloc(sizeof(client_rpc_req_t)); + if (NULL == req) { + ret = ENOMEM; + } else { + unifyfs_fops_ctx_t ctx = { + .app_id = in->app_id, + .client_id = in->client_id, + }; + req->req_type = UNIFYFS_CLIENT_RPC_TRUNCATE; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = NULL; + req->bulk_sz = 0; + ret = rm_submit_client_rpc_request(&ctx, req); + } + + if (ret != UNIFYFS_SUCCESS) { + margo_free_input(handle, in); + } + } + } + + /* if we hit an error during request submission, respond with the error */ + if (ret != UNIFYFS_SUCCESS) { + if (NULL != in) { + free(in); + } + + /* return to caller */ + unifyfs_truncate_out_t out; + out.ret = (int32_t) ret; + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* free margo resources */ + margo_destroy(handle); + } +} +DEFINE_MARGO_RPC_HANDLER(unifyfs_truncate_rpc) + +/* given an app_id, client_id, and global file id, + * remove file from system */ +static void unifyfs_unlink_rpc(hg_handle_t handle) +{ + int ret = UNIFYFS_SUCCESS; + hg_return_t hret; + + /* get input params */ + 
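/*
 * Illustrative sketch, not part of this patch: the handlers in this file only
 * decode the RPC input and hand it off via rm_submit_client_rpc_request();
 * the request manager later services the request and sends the success-path
 * response. The helper name below (process_client_rpc_req) and the exact
 * dispatch are assumptions -- the real logic lives in
 * unifyfs_request_manager.c -- and the unlink input is assumed to carry a
 * gfid field like the other metadata RPCs. Only the client_rpc_req_t fields,
 * fops calls, and margo calls are taken from this patch.
 */
static int process_client_rpc_req(unifyfs_fops_ctx_t* ctx, client_rpc_req_t* req)
{
    int ret = UNIFYFS_SUCCESS;

    switch (req->req_type) {
    case UNIFYFS_CLIENT_RPC_UNLINK: {
        unifyfs_unlink_in_t* in = (unifyfs_unlink_in_t*) req->input;
        ret = unifyfs_fops_unlink(ctx, (int) in->gfid);

        /* respond on the handle the rpc handler left open */
        unifyfs_unlink_out_t out;
        out.ret = (int32_t) ret;
        hg_return_t hret = margo_respond(req->handle, &out);
        if (hret != HG_SUCCESS) {
            LOGERR("margo_respond() failed");
        }

        /* release the resources the handler deliberately kept alive */
        margo_free_input(req->handle, in);
        margo_destroy(req->handle);
        free(in);
        break;
    }
    default:
        ret = EINVAL;
        break;
    }

    free(req);
    return ret;
}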
unifyfs_unlink_in_t* in = malloc(sizeof(*in)); + if (NULL == in) { + ret = ENOMEM; + } else { + hret = margo_get_input(handle, in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + client_rpc_req_t* req = malloc(sizeof(client_rpc_req_t)); + if (NULL == req) { + ret = ENOMEM; + } else { + unifyfs_fops_ctx_t ctx = { + .app_id = in->app_id, + .client_id = in->client_id, + }; + req->req_type = UNIFYFS_CLIENT_RPC_UNLINK; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = NULL; + req->bulk_sz = 0; + ret = rm_submit_client_rpc_request(&ctx, req); + } + + if (ret != UNIFYFS_SUCCESS) { + margo_free_input(handle, in); + } + } + } + + /* if we hit an error during request submission, respond with the error */ + if (ret != UNIFYFS_SUCCESS) { + if (NULL != in) { + free(in); + } + + /* return to caller */ + unifyfs_unlink_out_t out; + out.ret = (int32_t) ret; + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* free margo resources */ + margo_destroy(handle); + } + +} +DEFINE_MARGO_RPC_HANDLER(unifyfs_unlink_rpc) + +/* given an app_id, client_id, and global file id, + * laminate file */ +static void unifyfs_laminate_rpc(hg_handle_t handle) +{ + int ret = UNIFYFS_SUCCESS; + hg_return_t hret; + + /* get input params */ + unifyfs_laminate_in_t* in = malloc(sizeof(*in)); + if (NULL == in) { + ret = ENOMEM; + } else { + hret = margo_get_input(handle, in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + client_rpc_req_t* req = malloc(sizeof(client_rpc_req_t)); + if (NULL == req) { + ret = ENOMEM; + } else { + unifyfs_fops_ctx_t ctx = { + .app_id = in->app_id, + .client_id = in->client_id, + }; + req->req_type = UNIFYFS_CLIENT_RPC_LAMINATE; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = NULL; + req->bulk_sz = 0; + ret = rm_submit_client_rpc_request(&ctx, req); + } + + if (ret != UNIFYFS_SUCCESS) { + margo_free_input(handle, in); + } + } + } + + /* if we hit an error during request submission, respond with the error */ + if (ret != UNIFYFS_SUCCESS) { + if (NULL != in) { + free(in); + } + + /* return to caller */ + unifyfs_laminate_out_t out; + out.ret = (int32_t) ret; + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* free margo resources */ + margo_destroy(handle); + } + +} +DEFINE_MARGO_RPC_HANDLER(unifyfs_laminate_rpc) + /* given an app_id, client_id, global file id, an offset, and a length, - * initiate read operation to lookup and return data, + * initiate read operation to lookup and return data. 
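 * Unlike the metadata handlers above, the read path is serviced inline:
 * unifyfs_fops_read() is invoked directly from this handler rather than being
 * queued through rm_submit_client_rpc_request(), and the RPC response below
 * only reports whether the read was successfully initiated.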
* client synchronizes with server again later when data is available * to be copied into user buffers */ static void unifyfs_read_rpc(hg_handle_t handle) { + int ret = (int) UNIFYFS_SUCCESS; + /* get input params */ unifyfs_read_in_t in; hg_return_t hret = margo_get_input(handle, &in); - assert(hret == HG_SUCCESS); - - /* read data for a single read request from client, - * returns data to client through shared memory */ - int ret = rm_cmd_read(in.app_id, in.local_rank_idx, - in.gfid, in.offset, in.length); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* read data for a single read request from client, + * returns data to client through shared memory */ + unifyfs_fops_ctx_t ctx = { + .app_id = in.app_id, + .client_id = in.client_id, + }; + ret = unifyfs_fops_read(&ctx, in.gfid, in.offset, in.length); + + margo_free_input(handle, &in); + } /* build our output values */ unifyfs_read_out_t out; @@ -524,10 +669,11 @@ static void unifyfs_read_rpc(hg_handle_t handle) /* return to caller */ hret = margo_respond(handle, &out); - assert(hret == HG_SUCCESS); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } /* free margo resources */ - margo_free_input(handle, &in); margo_destroy(handle); } DEFINE_MARGO_RPC_HANDLER(unifyfs_read_rpc) @@ -539,36 +685,56 @@ DEFINE_MARGO_RPC_HANDLER(unifyfs_read_rpc) * to be copied into user buffers */ static void unifyfs_mread_rpc(hg_handle_t handle) { + int ret = (int) UNIFYFS_SUCCESS; + /* get input params */ unifyfs_mread_in_t in; hg_return_t hret = margo_get_input(handle, &in); - assert(hret == HG_SUCCESS); - - /* allocate buffer to hold array of read requests */ - hg_size_t size = in.bulk_size; - void* buffer = (void*)malloc(size); - assert(buffer); - - /* get pointer to mercury structures to set up bulk transfer */ - const struct hg_info* hgi = margo_get_info(handle); - assert(hgi); - margo_instance_id mid = margo_hg_info_get_instance(hgi); - assert(mid != MARGO_INSTANCE_NULL); - - /* register local target buffer for bulk access */ - hg_bulk_t bulk_handle; - hret = margo_bulk_create(mid, 1, &buffer, &size, - HG_BULK_WRITE_ONLY, &bulk_handle); - assert(hret == HG_SUCCESS); - - /* get list of read requests */ - hret = margo_bulk_transfer(mid, HG_BULK_PULL, hgi->addr, - in.bulk_handle, 0, bulk_handle, 0, size); - assert(hret == HG_SUCCESS); - - /* initiate read operations to fetch data for read requests */ - int ret = rm_cmd_mread(in.app_id, in.local_rank_idx, - in.read_count, buffer); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* allocate buffer to hold array of read requests */ + hg_size_t size = in.bulk_size; + void* buffer = malloc(size); + if (NULL == buffer) { + ret = ENOMEM; + } else { + /* get pointer to mercury structures to set up bulk transfer */ + const struct hg_info* hgi = margo_get_info(handle); + assert(hgi); + margo_instance_id mid = margo_hg_info_get_instance(hgi); + assert(mid != MARGO_INSTANCE_NULL); + + /* register local target buffer for bulk access */ + hg_bulk_t bulk_handle; + hret = margo_bulk_create(mid, 1, &buffer, &size, + HG_BULK_WRITE_ONLY, &bulk_handle); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_create() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* get list of read requests */ + hret = margo_bulk_transfer(mid, HG_BULK_PULL, hgi->addr, + in.bulk_handle, 0, bulk_handle, + 0, size); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_transfer() failed"); + ret = 
UNIFYFS_ERROR_MARGO; + } else { + /* initiate read operations to fetch data */ + unifyfs_fops_ctx_t ctx = { + .app_id = in.app_id, + .client_id = in.client_id, + }; + ret = unifyfs_fops_mread(&ctx, in.read_count, buffer); + } + margo_bulk_free(bulk_handle); + } + free(buffer); + } + margo_free_input(handle, &in); + } /* build our output values */ unifyfs_mread_out_t out; @@ -576,12 +742,11 @@ static void unifyfs_mread_rpc(hg_handle_t handle) /* return to caller */ hret = margo_respond(handle, &out); - assert(hret == HG_SUCCESS); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } /* free margo resources */ - margo_free_input(handle, &in); - margo_bulk_free(bulk_handle); - free(buffer); margo_destroy(handle); } DEFINE_MARGO_RPC_HANDLER(unifyfs_mread_rpc) diff --git a/server/src/unifyfs_fops.h b/server/src/unifyfs_fops.h new file mode 100644 index 000000000..c57038a43 --- /dev/null +++ b/server/src/unifyfs_fops.h @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#ifndef __UNIFYFS_FOPS_H +#define __UNIFYFS_FOPS_H + +#include "unifyfs_configurator.h" +#include "unifyfs_log.h" +#include "unifyfs_meta.h" + +/* + * extra information that we need to pass for file operations. + */ +struct _unifyfs_fops_ctx { + int app_id; + int client_id; +}; +typedef struct _unifyfs_fops_ctx unifyfs_fops_ctx_t; + +typedef int (*unifyfs_fops_init_t)(unifyfs_cfg_t* cfg); + +typedef int (*unifyfs_fops_metaget_t)(unifyfs_fops_ctx_t* ctx, + int gfid, unifyfs_file_attr_t* attr); + +typedef int (*unifyfs_fops_metaset_t)(unifyfs_fops_ctx_t* ctx, + int gfid, int attr_op, + unifyfs_file_attr_t* attr); + +typedef int (*unifyfs_fops_fsync_t)(unifyfs_fops_ctx_t* ctx, int gfid); + +typedef int (*unifyfs_fops_filesize_t)(unifyfs_fops_ctx_t* ctx, + int gfid, size_t* filesize); + +typedef int (*unifyfs_fops_truncate_t)(unifyfs_fops_ctx_t* ctx, + int gfid, off_t len); + +typedef int (*unifyfs_fops_laminate_t)(unifyfs_fops_ctx_t* ctx, int gfid); + +typedef int (*unifyfs_fops_unlink_t)(unifyfs_fops_ctx_t* ctx, int gfid); + +typedef int (*unifyfs_fops_read_t)(unifyfs_fops_ctx_t* ctx, + int gfid, off_t offset, size_t len); + +typedef int (*unifyfs_fops_mread_t)(unifyfs_fops_ctx_t* ctx, + size_t n_req, void* req); + +struct unifyfs_fops { + const char* name; + unifyfs_fops_init_t init; + unifyfs_fops_metaget_t metaget; + unifyfs_fops_metaset_t metaset; + unifyfs_fops_fsync_t fsync; + unifyfs_fops_filesize_t filesize; + unifyfs_fops_truncate_t truncate; + unifyfs_fops_laminate_t laminate; + unifyfs_fops_unlink_t unlink; + unifyfs_fops_read_t read; + unifyfs_fops_mread_t mread; +}; + +/* available file operations. 
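 * Each backend source file (unifyfs_fops_mdhim.c, unifyfs_fops_rpc.c) fills in
 * a struct unifyfs_fops and exports it as unifyfs_fops_impl. unifyfs_fops_init()
 * latches that table into global_fops_tab, and the inline wrappers below
 * dispatch through it, returning ENOSYS for any operation a backend leaves NULL.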
*/ +extern struct unifyfs_fops* unifyfs_fops_impl; + +/* the one that is configured to be used: defined in unifyfs_server.c */ +extern struct unifyfs_fops* global_fops_tab; + +static inline int unifyfs_fops_init(unifyfs_cfg_t* cfg) +{ + int ret = UNIFYFS_SUCCESS; + struct unifyfs_fops* fops = unifyfs_fops_impl; + + if (!fops) { + LOGERR("failed to get the file operation table"); + } + + if (fops->init) { + ret = fops->init(cfg); + if (ret) { + LOGERR("failed to initialize fops table (ret=%d)", ret); + return ret; + } + } + + global_fops_tab = fops; + + return ret; +} + +static inline int unifyfs_fops_metaget(unifyfs_fops_ctx_t* ctx, + int gfid, unifyfs_file_attr_t* attr) +{ + if (!global_fops_tab->metaget) { + return ENOSYS; + } + + return global_fops_tab->metaget(ctx, gfid, attr); +} + +static inline int unifyfs_fops_metaset(unifyfs_fops_ctx_t* ctx, + int gfid, int attr_op, + unifyfs_file_attr_t* attr) +{ + if (!global_fops_tab->metaset) { + return ENOSYS; + } + + return global_fops_tab->metaset(ctx, gfid, attr_op, attr); +} + +static inline int unifyfs_fops_fsync(unifyfs_fops_ctx_t* ctx, int gfid) +{ + if (!global_fops_tab->fsync) { + return ENOSYS; + } + + return global_fops_tab->fsync(ctx, gfid); +} + +static inline int unifyfs_fops_filesize(unifyfs_fops_ctx_t* ctx, + int gfid, size_t* filesize) +{ + if (!global_fops_tab->filesize) { + return ENOSYS; + } + + return global_fops_tab->filesize(ctx, gfid, filesize); +} + +static inline int unifyfs_fops_truncate(unifyfs_fops_ctx_t* ctx, + int gfid, off_t len) +{ + if (!global_fops_tab->truncate) { + return ENOSYS; + } + + return global_fops_tab->truncate(ctx, gfid, len); +} + +static inline int unifyfs_fops_laminate(unifyfs_fops_ctx_t* ctx, int gfid) +{ + if (!global_fops_tab->laminate) { + return ENOSYS; + } + + return global_fops_tab->laminate(ctx, gfid); +} + +static inline int unifyfs_fops_unlink(unifyfs_fops_ctx_t* ctx, int gfid) +{ + if (!global_fops_tab->unlink) { + return ENOSYS; + } + + return global_fops_tab->unlink(ctx, gfid); +} + +static inline int unifyfs_fops_read(unifyfs_fops_ctx_t* ctx, + int gfid, off_t offset, size_t len) +{ + if (!global_fops_tab->read) { + return ENOSYS; + } + + LOGDBG("redirecting fops_read (fops_tab: %s)", global_fops_tab->name); + + return global_fops_tab->read(ctx, gfid, offset, len); +} + +static inline int unifyfs_fops_mread(unifyfs_fops_ctx_t* ctx, + size_t n_req, void* req) +{ + if (!global_fops_tab->mread) { + return ENOSYS; + } + + return global_fops_tab->mread(ctx, n_req, req); +} + +#endif /* __UNIFYFS_FOPS_H */ diff --git a/server/src/unifyfs_fops_mdhim.c b/server/src/unifyfs_fops_mdhim.c new file mode 100644 index 000000000..c51c916fa --- /dev/null +++ b/server/src/unifyfs_fops_mdhim.c @@ -0,0 +1,1197 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
+ */ + +#include "unifyfs_group_rpc.h" +#include "unifyfs_metadata_mdhim.h" +#include "unifyfs_request_manager.h" + +/* given an extent corresponding to a write index, create new key/value + * pairs for that extent, splitting into multiple keys at the slice + * range boundaries (meta_slice_sz), it returns the number of + * newly created key/values inserted into the given key and value + * arrays */ +static int split_index( + unifyfs_key_t** keys, /* list to add newly created keys into */ + unifyfs_val_t** vals, /* list to add newly created values into */ + int* keylens, /* list for size of each key */ + int* vallens, /* list for size of each value */ + int gfid, /* global file id of write */ + size_t offset, /* starting byte offset of extent */ + size_t length, /* number of bytes in extent */ + size_t log_offset, /* offset within data log */ + int server_rank, /* rank of server hosting data */ + int app_id, /* app_id holding data */ + int client_rank) /* client rank holding data */ +{ + /* offset of first byte in request */ + size_t pos = offset; + + /* offset of last byte in request */ + size_t last_offset = offset + length - 1; + + /* this will track the current offset within the log + * where the data starts, we advance it with each key + * we generate depending on the data associated with + * each key */ + size_t logpos = log_offset; + + /* iterate over slice ranges and generate a start/end + * pair of keys for each */ + int count = 0; + while (pos <= last_offset) { + /* compute offset for first byte in this slice */ + size_t start = pos; + + /* offset for last byte in this slice, + * assume that's the last byte of the same slice + * containing start, unless that happens to be + * beyond the last byte of the actual request */ + size_t start_slice = start / meta_slice_sz; + size_t end = (start_slice + 1) * meta_slice_sz - 1; + if (end > last_offset) { + end = last_offset; + } + + /* length of extent in this slice */ + size_t len = end - start + 1; + + /* create key to describe this log entry */ + unifyfs_key_t* k = keys[count]; + k->gfid = gfid; + k->offset = start; + keylens[count] = sizeof(unifyfs_key_t); + + /* create value to store address of data */ + unifyfs_val_t* v = vals[count]; + v->addr = logpos; + v->len = len; + v->app_id = app_id; + v->rank = client_rank; + v->delegator_rank = server_rank; + vallens[count] = sizeof(unifyfs_val_t); + + /* advance to next slot in key/value arrays */ + count++; + + /* advance offset into log */ + logpos += len; + + /* advance to first byte offset of next slice */ + pos = end + 1; + } + + /* return number of keys we generated */ + return count; +} + +/* given a global file id, an offset, and a length to read from that + * file, create keys needed to query MDHIM for location of data + * corresponding to that extent, returns the number of keys inserted + * into key array provided by caller */ +static int split_request( + unifyfs_key_t** keys, /* list to add newly created keys into */ + int* keylens, /* list to add byte size of each key */ + int gfid, /* target global file id to read from */ + size_t offset, /* starting offset of read */ + size_t length) /* number of bytes to read */ +{ + /* offset of first byte in request */ + size_t pos = offset; + + /* offset of last byte in request */ + size_t last_offset = offset + length - 1; + + /* iterate over slice ranges and generate a start/end + * pair of keys for each */ + int count = 0; + while (pos <= last_offset) { + /* compute offset for first byte in this segment */ + size_t start = pos; + + /* 
offset for last byte in this segment, + * assume that's the last byte of the same segment + * containing start, unless that happens to be + * beyond the last byte of the actual request */ + size_t start_slice = start / meta_slice_sz; + size_t end = (start_slice + 1) * meta_slice_sz - 1; + if (end > last_offset) { + end = last_offset; + } + + /* create key to describe first byte we'll read + * in this slice */ + keys[count]->gfid = gfid; + keys[count]->offset = start; + keylens[count] = sizeof(unifyfs_key_t); + count++; + + /* create key to describe last byte we'll read + * in this slice */ + keys[count]->gfid = gfid; + keys[count]->offset = end; + keylens[count] = sizeof(unifyfs_key_t); + count++; + + /* advance to first byte offset of next slice */ + pos = end + 1; + } + + /* return number of keys we generated */ + return count; +} + + +static int mdhim_init(unifyfs_cfg_t* cfg) +{ + int ret = 0; + + LOGDBG("initializing file operations.."); + + ret = meta_init_store(cfg); + if (ret) { + LOGERR("failed to initialize the meta kv store (ret=%d)", ret); + } + + return ret; +} + +static int mdhim_metaget(unifyfs_fops_ctx_t* ctx, + int gfid, unifyfs_file_attr_t* attr) +{ + return unifyfs_get_file_attribute(gfid, attr); +} + +static int mdhim_metaset(unifyfs_fops_ctx_t* ctx, + int gfid, int create, unifyfs_file_attr_t* attr) +{ + return unifyfs_set_file_attribute(create, create, attr); +} + +static int mdhim_fsync(unifyfs_fops_ctx_t* ctx, int gfid) +{ + size_t i; + + /* assume we'll succeed */ + int ret = (int)UNIFYFS_SUCCESS; + + /* get memory page size on this machine */ + int page_sz = getpagesize(); + + /* get application client */ + app_client* client = get_app_client(ctx->app_id, ctx->client_id); + if (NULL == client) { + return EINVAL; + } + + /* get pointer to superblock for this client and app */ + shm_context* super_ctx = client->shmem_super; + if (NULL == super_ctx) { + LOGERR("missing client superblock"); + return UNIFYFS_FAILURE; + } + char* superblk = (char*)(super_ctx->addr); + + /* get pointer to start of key/value region in superblock */ + char* meta = superblk + client->super_meta_offset; + + /* get number of file extent index values client has for us, + * stored as a size_t value in meta region of shared memory */ + size_t extent_num_entries = *(size_t*)(meta); + + /* indices are stored in the superblock shared memory + * created by the client, these are stored as index_t + * structs starting one page size offset into meta region */ + char* ptr_extents = meta + page_sz; + + if (extent_num_entries == 0) { + /* Nothing to do */ + return UNIFYFS_SUCCESS; + } + + unifyfs_index_t* meta_payload = (unifyfs_index_t*)(ptr_extents); + + /* total up number of key/value pairs we'll need for this + * set of index values */ + size_t slices = 0; + for (i = 0; i < extent_num_entries; i++) { + size_t offset = meta_payload[i].file_pos; + size_t length = meta_payload[i].length; + slices += meta_num_slices(offset, length); + } + if (slices >= UNIFYFS_MAX_SPLIT_CNT) { + LOGERR("Error allocating buffers"); + return ENOMEM; + } + + /* pointers to memory we'll dynamically allocate for file extents */ + unifyfs_key_t** keys = NULL; + unifyfs_val_t** vals = NULL; + int* key_lens = NULL; + int* val_lens = NULL; + + /* allocate storage for file extent key/values */ + /* TODO: possibly get this from memory pool */ + keys = alloc_key_array(slices); + vals = alloc_value_array(slices); + key_lens = calloc(slices, sizeof(int)); + val_lens = calloc(slices, sizeof(int)); + if ((NULL == keys) || + (NULL == vals) 
|| + (NULL == key_lens) || + (NULL == val_lens)) { + LOGERR("failed to allocate memory for file extents"); + ret = ENOMEM; + goto mdhim_sync_exit; + } + + /* create file extent key/values for insertion into MDHIM */ + int count = 0; + for (i = 0; i < extent_num_entries; i++) { + /* get file offset, length, and log offset for this entry */ + unifyfs_index_t* meta = &meta_payload[i]; + assert(gfid == meta->gfid); + size_t offset = meta->file_pos; + size_t length = meta->length; + size_t logpos = meta->log_pos; + + /* split this entry at the offset boundaries */ + int used = split_index( + &keys[count], &vals[count], &key_lens[count], &val_lens[count], + gfid, offset, length, logpos, + glb_pmi_rank, ctx->app_id, ctx->client_id); + + /* count up the number of keys we used for this index */ + count += used; + } + + /* batch insert file extent key/values into MDHIM */ + ret = unifyfs_set_file_extents((int)count, + keys, key_lens, vals, val_lens); + if (ret != UNIFYFS_SUCCESS) { + /* TODO: need proper error handling */ + LOGERR("unifyfs_set_file_extents() failed"); + goto mdhim_sync_exit; + } + +mdhim_sync_exit: + /* clean up memory */ + if (NULL != keys) { + free_key_array(keys); + } + + if (NULL != vals) { + free_value_array(vals); + } + + if (NULL != key_lens) { + free(key_lens); + } + + if (NULL != val_lens) { + free(val_lens); + } + + return ret; +} + +static int mdhim_filesize(unifyfs_fops_ctx_t* ctx, int gfid, size_t* outsize) +{ + size_t filesize = 0; + int ret = unifyfs_invoke_filesize_rpc(gfid, &filesize); + if (ret) { + LOGERR("filesize rpc failed (ret=%d)", ret); + } else { + LOGDBG("filesize rpc returned %zu", filesize); + *outsize = filesize; + } + + unifyfs_file_attr_t attr = { 0, }; + mdhim_metaget(ctx, gfid, &attr); + + /* return greater of rpc value and mdhim metadata size */ + size_t asize = (size_t) attr.size; + if (asize > filesize) { + *outsize = asize; + } + + return ret; +} + +/* delete any key whose last byte is beyond the specified + * file size */ +static int truncate_delete_keys( + size_t filesize, /* new file size */ + int num, /* number of entries in keyvals */ + unifyfs_keyval_t* keyvals) /* list of existing key/values */ +{ + /* assume we'll succeed */ + int ret = (int) UNIFYFS_SUCCESS; + + /* pointers to memory we'll dynamically allocate for file extents */ + unifyfs_key_t** unifyfs_keys = NULL; + unifyfs_val_t** unifyfs_vals = NULL; + int* unifyfs_key_lens = NULL; + int* unifyfs_val_lens = NULL; + + /* in the worst case, we'll have to delete all existing keys */ + /* allocate storage for file extent key/values */ + /* TODO: possibly get this from memory pool */ + unifyfs_keys = alloc_key_array(num); + unifyfs_vals = alloc_value_array(num); + unifyfs_key_lens = calloc(num, sizeof(int)); + unifyfs_val_lens = calloc(num, sizeof(int)); + if ((NULL == unifyfs_keys) || + (NULL == unifyfs_vals) || + (NULL == unifyfs_key_lens) || + (NULL == unifyfs_val_lens)) { + LOGERR("failed to allocate memory for file extents"); + ret = ENOMEM; + goto truncate_delete_exit; + } + + /* counter for number of key/values we need to delete */ + int delete_count = 0; + + /* iterate over each key, and if this index extends beyond desired + * file size, create an entry to delete that key */ + int i; + for (i = 0; i < num; i++) { + /* get pointer to next key value pair */ + unifyfs_keyval_t* kv = &keyvals[i]; + + /* get last byte offset for this segment of the file */ + size_t last_offset = kv->key.offset + kv->val.len; + + /* if this segment extends beyond the new file size, + * we need to 
delete this index entry */ + if (last_offset > filesize) { + /* found an index that extends past end of desired + * file size, get next empty key entry from the pool */ + unifyfs_key_t* key = unifyfs_keys[delete_count]; + + /* define the key to be deleted */ + key->gfid = kv->key.gfid; + key->offset = kv->key.offset; + + /* MDHIM needs to know the byte size of each key and value */ + unifyfs_key_lens[delete_count] = sizeof(unifyfs_key_t); + //unifyfs_val_lens[delete_count] = sizeof(unifyfs_val_t); + + /* increment the number of keys we're deleting */ + delete_count++; + } + } + + /* batch delete file extent key/values from MDHIM */ + if (delete_count > 0) { + ret = unifyfs_delete_file_extents(delete_count, + unifyfs_keys, unifyfs_key_lens); + if (ret != UNIFYFS_SUCCESS) { + /* TODO: need proper error handling */ + LOGERR("unifyfs_delete_file_extents() failed"); + goto truncate_delete_exit; + } + } + +truncate_delete_exit: + /* clean up memory */ + + if (NULL != unifyfs_keys) { + free_key_array(unifyfs_keys); + } + + if (NULL != unifyfs_vals) { + free_value_array(unifyfs_vals); + } + + if (NULL != unifyfs_key_lens) { + free(unifyfs_key_lens); + } + + if (NULL != unifyfs_val_lens) { + free(unifyfs_val_lens); + } + + return ret; +} + +/* rewrite any key that overlaps with new file size, + * we assume the existing key has already been deleted */ +static int truncate_rewrite_keys( + size_t filesize, /* new file size */ + int num, /* number of entries in keyvals */ + unifyfs_keyval_t* keyvals) /* list of existing key/values */ +{ + /* assume we'll succeed */ + int ret = (int) UNIFYFS_SUCCESS; + + /* pointers to memory we'll dynamically allocate for file extents */ + unifyfs_key_t** unifyfs_keys = NULL; + unifyfs_val_t** unifyfs_vals = NULL; + int* unifyfs_key_lens = NULL; + int* unifyfs_val_lens = NULL; + + /* in the worst case, we'll have to rewrite all existing keys */ + /* allocate storage for file extent key/values */ + /* TODO: possibly get this from memory pool */ + unifyfs_keys = alloc_key_array(num); + unifyfs_vals = alloc_value_array(num); + unifyfs_key_lens = calloc(num, sizeof(int)); + unifyfs_val_lens = calloc(num, sizeof(int)); + if ((NULL == unifyfs_keys) || + (NULL == unifyfs_vals) || + (NULL == unifyfs_key_lens) || + (NULL == unifyfs_val_lens)) { + LOGERR("failed to allocate memory for file extents"); + ret = ENOMEM; + goto truncate_rewrite_exit; + } + + /* counter for number of key/values we need to rewrite */ + int count = 0; + + /* iterate over each key, and if this index starts before + * and ends after the desired file size, create an entry + * that ends at new file size */ + int i; + for (i = 0; i < num; i++) { + /* get pointer to next key value pair */ + unifyfs_keyval_t* kv = &keyvals[i]; + + /* get first byte offset for this segment of the file */ + size_t first_offset = kv->key.offset; + + /* get last byte offset for this segment of the file */ + size_t last_offset = kv->key.offset + kv->val.len; + + /* if this segment extends beyond the new file size, + * we need to rewrite this index entry */ + if (first_offset < filesize && + last_offset > filesize) { + /* found an index that overlaps end of desired + * file size, get next empty key entry from the pool */ + unifyfs_key_t* key = unifyfs_keys[count]; + + /* define the key to be rewritten */ + key->gfid = kv->key.gfid; + key->offset = kv->key.offset; + + /* compute new length of this entry */ + size_t newlen = (size_t)(filesize - first_offset); + + /* for the value, we store the log position, the length, + * the host 
server (delegator rank), the mount point id + * (app id), and the client id (rank) */ + unifyfs_val_t* val = unifyfs_vals[count]; + val->addr = kv->val.addr; + val->len = newlen; + val->delegator_rank = kv->val.delegator_rank; + val->app_id = kv->val.app_id; + val->rank = kv->val.rank; + + /* MDHIM needs to know the byte size of each key and value */ + unifyfs_key_lens[count] = sizeof(unifyfs_key_t); + unifyfs_val_lens[count] = sizeof(unifyfs_val_t); + + /* increment the number of keys we're deleting */ + count++; + } + } + + /* batch set file extent key/values from MDHIM */ + if (count > 0) { + ret = unifyfs_set_file_extents(count, + unifyfs_keys, unifyfs_key_lens, + unifyfs_vals, unifyfs_val_lens); + if (ret != UNIFYFS_SUCCESS) { + /* TODO: need proper error handling */ + LOGERR("unifyfs_set_file_extents() failed"); + goto truncate_rewrite_exit; + } + } + +truncate_rewrite_exit: + /* clean up memory */ + + if (NULL != unifyfs_keys) { + free_key_array(unifyfs_keys); + } + + if (NULL != unifyfs_vals) { + free_value_array(unifyfs_vals); + } + + if (NULL != unifyfs_key_lens) { + free(unifyfs_key_lens); + } + + if (NULL != unifyfs_val_lens) { + free(unifyfs_val_lens); + } + + return ret; +} + +static int mdhim_truncate(unifyfs_fops_ctx_t* ctx, int gfid, off_t len) +{ + size_t newsize = (size_t) len; + + /* set offset and length to request *all* key/value pairs + * for this file */ + size_t offset = 0; + + /* want to pick the highest integer offset value a file + * could have here */ + size_t length = (SIZE_MAX >> 1) - 1; + + /* get the locations of all the read requests from the + * key-value store*/ + unifyfs_key_t key1, key2; + + /* create key to describe first byte we'll read */ + key1.gfid = gfid; + key1.offset = offset; + + /* create key to describe last byte we'll read */ + key2.gfid = gfid; + key2.offset = offset + length - 1; + + /* set up input params to specify range lookup */ + unifyfs_key_t* unifyfs_keys[2] = {&key1, &key2}; + int key_lens[2] = {sizeof(unifyfs_key_t), sizeof(unifyfs_key_t)}; + + /* look up all entries in this range */ + int num_vals = 0; + unifyfs_keyval_t* keyvals = NULL; + int rc = unifyfs_get_file_extents(2, unifyfs_keys, key_lens, + &num_vals, &keyvals); + if (UNIFYFS_SUCCESS != rc) { + /* failed to look up extents, bail with error */ + return UNIFYFS_FAILURE; + } + + /* compute our file size by iterating over each file + * segment and taking the max logical offset */ + int i; + size_t filesize = 0; + for (i = 0; i < num_vals; i++) { + /* get pointer to next key value pair */ + unifyfs_keyval_t* kv = &keyvals[i]; + + /* get last byte offset for this segment of the file */ + size_t last_offset = kv->key.offset + kv->val.len; + + /* update our filesize if this offset is bigger than the current max */ + if (last_offset > filesize) { + filesize = last_offset; + } + } + + /* get filesize as recorded in metadata, which may be bigger if + * user issued an ftruncate on the file to extend it past the + * last write */ + size_t filesize_meta = filesize; + + /* given the global file id, look up file attributes + * from key/value store */ + unifyfs_file_attr_t fattr; + rc = unifyfs_get_file_attribute(gfid, &fattr); + if (rc == UNIFYFS_SUCCESS) { + /* found file attribute for this file, now get its size */ + filesize_meta = fattr.size; + } else { + /* failed to find file attributes for this file */ + goto truncate_exit; + } + + /* take maximum of last write and file size from metadata */ + if (filesize_meta > filesize) { + filesize = filesize_meta; + } + + /* may need to 
throw away and rewrite keys if shrinking file */ + if (newsize < filesize) { + /* delete any key that extends beyond new file size */ + rc = truncate_delete_keys(newsize, num_vals, keyvals); + if (rc != UNIFYFS_SUCCESS) { + goto truncate_exit; + } + + /* rewrite any key that overlaps new file size */ + rc = truncate_rewrite_keys(newsize, num_vals, keyvals); + if (rc != UNIFYFS_SUCCESS) { + goto truncate_exit; + } + } + + /* update file size field with latest size */ + fattr.size = newsize; + rc = unifyfs_set_file_attribute(1, 0, &fattr); + if (rc != UNIFYFS_SUCCESS) { + /* failed to update file attributes with new file size */ + goto truncate_exit; + } + + rc = unifyfs_invoke_truncate_rpc(gfid, newsize); + if (rc) { + LOGERR("truncate rpc failed"); + } + +truncate_exit: + + /* free off key/value buffer returned from get_file_extents */ + if (NULL != keyvals) { + free(keyvals); + keyvals = NULL; + } + + return rc; +} + +static int mdhim_laminate(unifyfs_fops_ctx_t* ctx, int gfid) +{ + int rc = UNIFYFS_SUCCESS; + + /* given the global file id, look up file attributes + * from key/value store */ + unifyfs_file_attr_t attr = { 0, }; + int ret = mdhim_metaget(ctx, gfid, &attr); + if (ret != UNIFYFS_SUCCESS) { + /* failed to find attributes for the file */ + return ret; + } + + /* if item is not a file, bail with error */ + mode_t mode = (mode_t) attr.mode; + if ((mode & S_IFMT) != S_IFREG) { + /* item is not a regular file */ + LOGERR("ERROR: only regular files can be laminated (gfid=%d)", gfid); + return EINVAL; + } + + /* lookup current file size */ + size_t filesize; + ret = mdhim_filesize(ctx, gfid, &filesize); + if (ret != UNIFYFS_SUCCESS) { + /* failed to get file size for file */ + LOGERR("lamination file size calculation failed (gfid=%d)", gfid); + return ret; + } + + /* update fields in metadata */ + attr.size = filesize; + attr.is_laminated = 1; + + /* update metadata, set size and laminate */ + rc = unifyfs_set_file_attribute(1, 1, &attr); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("lamination metadata update failed (gfid=%d)", gfid); + } + + return rc; +} + +static int mdhim_unlink(unifyfs_fops_ctx_t* ctx, int gfid) +{ + int rc = UNIFYFS_SUCCESS; + + /* given the global file id, look up file attributes + * from key/value store */ + unifyfs_file_attr_t attr; + int ret = unifyfs_get_file_attribute(gfid, &attr); + if (ret != UNIFYFS_SUCCESS) { + /* failed to find attributes for the file */ + return ret; + } + + /* if item is a file, call truncate to free space */ + mode_t mode = (mode_t) attr.mode; + if ((mode & S_IFMT) == S_IFREG) { + /* item is regular file, truncate to 0 */ + ret = mdhim_truncate(ctx, gfid, 0); + if (ret != UNIFYFS_SUCCESS) { + /* failed to delete write extents for file, + * let's leave the file attributes in place */ + return ret; + } + } + + /* delete metadata */ + ret = unifyfs_delete_file_attribute(gfid); + if (ret != UNIFYFS_SUCCESS) { + rc = ret; + } + + rc = unifyfs_invoke_unlink_rpc(gfid); + if (rc) { + LOGERR("unlink rpc failed (ret=%d)", rc); + } + + return rc; +} + + +/* given a set of input key pairs, where each pair describes the first + * and last byte offset of a data range, refer to our local extent map + * and generate keyval responses for any ranges covering data that is + * local to the server, generate new key pairs to describe remaining + * holes that will be queried against the global key/value store, + * the list of output keys, key lengths, and keyvals are allocated + * and returned to be freed by the caller */ +static int get_local_keyvals( + 
int num_keys, /* number of input keys */ + unifyfs_key_t** keys, /* list of input keys */ + int* keylens, /* list of input key lengths */ + int* out_global, /* number of output keys for server */ + unifyfs_key_t*** out_keys, /* list of output keys */ + int** out_keylens, /* list of output key lengths */ + int* num_keyvals, /* number of output keyvals from local data */ + unifyfs_keyval_t** keyvals) /* list of output keyvals */ +{ + /* initialize output parameters */ + *out_global = 0; + *out_keys = NULL; + *out_keylens = NULL; + *num_keyvals = 0; + *keyvals = NULL; + + /* allocate memory to copy key/value data */ + int max_keyvals = UNIFYFS_MAX_SPLIT_CNT; + unifyfs_keyval_t* kvs_local = (unifyfs_keyval_t*) calloc( + max_keyvals, sizeof(unifyfs_keyval_t)); + if (NULL == kvs_local) { + LOGERR("failed to allocate keyvals"); + return (int)UNIFYFS_ERROR_MDHIM; + } + + /* allocate memory to define remaining keys to + * search in global store */ + unifyfs_key_t** keys_global = alloc_key_array(max_keyvals); + if (NULL == keys_global) { + LOGERR("failed to allocate keys"); + free(kvs_local); + return (int)UNIFYFS_ERROR_MDHIM; + } + + /* allocate memory to define key lengths for remaining keys to + * search in global store */ + int* keylens_global = (int*) calloc(max_keyvals, sizeof(int)); + if (NULL == keylens_global) { + LOGERR("failed to allocate keylens"); + free_key_array(keys_global); + free(kvs_local); + return (int)UNIFYFS_ERROR_MDHIM; + } + + /* counters for the number of local keyvals we create and the + * number of keys we generate for the global key/value store */ + int count_global = 0; + int count_local = 0; + + int i; + for (i = 0; i < num_keys; i += 2) { + /* get next key pair that describe start and end offsets */ + unifyfs_key_t* k1 = keys[i+0]; + unifyfs_key_t* k2 = keys[i+1]; + + /* get gfid, start, and end offset of this pair */ + int gfid = k1->gfid; + size_t start = k1->offset; + size_t end = k2->offset; + + /* we'll define key/values in these temp arrays that correspond + * to extents we have locally */ + unifyfs_key_t tmpkeys[UNIFYFS_MAX_SPLIT_CNT]; + unifyfs_val_t tmpvals[UNIFYFS_MAX_SPLIT_CNT]; + + /* look up any entries we can find in our local extent map */ + int num_local = 0; + int ret = unifyfs_inode_span_extents(gfid, start, end, + UNIFYFS_MAX_SPLIT_CNT, tmpkeys, tmpvals, &num_local); + if (ret) { + LOGERR("failed to span extents (gfid=%d)", gfid); + // now what? 
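            // If the local span lookup fails we fall through with num_local
            // still at zero (assuming the failed call leaves it untouched),
            // so the loop below is skipped and the entire [start, end] range
            // is emitted as a "hole" to be queried from the global key/value
            // store instead of returning an error.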
+ } + + /* iterate over local keys, create new keys to pass to server + * for any holes in our local extents */ + int j; + size_t nextstart = start; + for (j = 0; j < num_local; j++) { + /* get next key/value returned from local extent */ + unifyfs_key_t* k = &tmpkeys[j]; + unifyfs_val_t* v = &tmpvals[j]; + + /* if we have a gap in our data, + * we need to ask the global key/value store */ + if (nextstart < k->offset) { + /* we're missing a section of bytes, so create a key + * pair to search for this hole in the global key/value + * store */ + + /* check that we don't overflow the global array */ + if (count_global + 2 > max_keyvals) { + /* exhausted our space */ + free(keylens_global); + free_key_array(keys_global); + free(kvs_local); + return ENOMEM; + } + + /* first key is for starting offset of the hole, + * which is defined in next start */ + unifyfs_key_t* gk1 = keys_global[count_global]; + gk1->gfid = gfid; + gk1->offset = nextstart; + keylens_global[count_global] = sizeof(unifyfs_key_t); + count_global++; + + /* second key is for ending offset of the hole, + * which will be the offset of the byte that comes + * just before the offset of the current key */ + unifyfs_key_t* gk2 = keys_global[count_global]; + gk2->gfid = gfid; + gk2->offset = k->offset - 1; + keylens_global[count_global] = sizeof(unifyfs_key_t); + count_global++; + } else { + /* otherwise we have a local extent that matches, + * copy the corresponding key/value pair into the + * local output array */ + + /* check that we don't overflow the local array */ + if (count_local + 1 > max_keyvals) { + /* exhausted our space */ + free(keylens_global); + free_key_array(keys_global); + free(kvs_local); + return ENOMEM; + } + + /* create a key/value describing the + * current local extent */ + + /* get pointer to next key/val */ + unifyfs_keyval_t* kv = &kvs_local[count_local]; + + /* copy in the key and value generated from the call + * to tree_span into our array of local key/value pairs */ + memcpy(&kv->key, k, sizeof(unifyfs_key_t)); + memcpy(&kv->val, v, sizeof(unifyfs_val_t)); + + /* increase the number of keyvals we've found locally */ + count_local++; + } + + /* advance to start of next segment we're looking for */ + nextstart = k->offset + v->len; + } + + /* verify that we covered the full range, create a key pair + * to look in the global key/value store for any trailing hole */ + if (nextstart <= end) { + /* check that we don't overflow the global array */ + if (count_global + 2 > max_keyvals) { + /* exhausted our space */ + free(keylens_global); + free_key_array(keys_global); + free(kvs_local); + return ENOMEM; + } + + /* first key is for starting offset of the hole, + * which is defined in next start */ + unifyfs_key_t* gk1 = keys_global[count_global]; + gk1->gfid = gfid; + gk1->offset = nextstart; + keylens_global[count_global] = sizeof(unifyfs_key_t); + count_global++; + + /* second key is for ending offset of the hole */ + unifyfs_key_t* gk2 = keys_global[count_global]; + gk2->gfid = gfid; + gk2->offset = end; + keylens_global[count_global] = sizeof(unifyfs_key_t); + count_global++; + } + } + + /* set output values */ + *out_global = count_global; + *out_keys = keys_global; + *out_keylens = keylens_global; + *num_keyvals = count_local; + *keyvals = kvs_local; + + return UNIFYFS_SUCCESS; +} + +static int create_gfid_chunk_reads(reqmgr_thrd_t* thrd_ctrl, int gfid, + int app_id, int client_id, int num_keys, + unifyfs_key_t** keys, int* keylens) +{ + int rc = UNIFYFS_SUCCESS; + + int num_vals = 0; + 
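    /* Shared by mdhim_read() and mdhim_mread(): takes the start/end key pairs
     * built by split_request(), performs the global MDHIM range lookup, sorts
     * the resulting keyvals, and converts them into chunk read requests that
     * are handed to the request manager thread. */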
unifyfs_keyval_t* keyvals = NULL; + + /* not using our local extent map, + * lookup all keys from global key/value store */ + rc = unifyfs_get_file_extents(num_keys, keys, keylens, + &num_vals, &keyvals); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("failed to lookup keyvals from global key/val store"); + return rc; + } + + /* this is to maintain limits imposed in previous code + * that would throw fatal errors */ + if (num_vals >= UNIFYFS_MAX_SPLIT_CNT || + num_vals >= MAX_META_PER_SEND) { + LOGERR("too many key/values returned in range lookup"); + if (NULL != keyvals) { + free(keyvals); + keyvals = NULL; + } + return ENOMEM; + } + + if (UNIFYFS_SUCCESS != rc) { + /* failed to find any key / value pairs */ + rc = UNIFYFS_FAILURE; + } else { + /* if we get more than one write index entry + * sort them by file id and then by delegator rank */ + if (num_vals > 1) { + qsort(keyvals, (size_t)num_vals, sizeof(unifyfs_keyval_t), + unifyfs_keyval_compare); + } + + server_read_req_t* rdreq = rm_reserve_read_req(thrd_ctrl); + if (NULL == rdreq) { + rc = UNIFYFS_FAILURE; + } else { + rdreq->app_id = app_id; + rdreq->client_id = client_id; + /* TODO: rdreq->extent was removed + * rdreq->extent.gfid = gfid; + * rdreq->extent.errcode = EINPROGRESS; + */ + rc = rm_create_chunk_requests(thrd_ctrl, rdreq, + num_vals, keyvals); + if (rc != (int)UNIFYFS_SUCCESS) { + rm_release_read_req(thrd_ctrl, rdreq); + } + } + } + + /* free off key/value buffer returned from get_file_extents */ + if (NULL != keyvals) { + free(keyvals); + keyvals = NULL; + } + + return rc; +} + +static int mdhim_read(unifyfs_fops_ctx_t* ctx, + int gfid, off_t offset, size_t length) +{ + /* get application client */ + int app_id = ctx->app_id; + int client_id = ctx->client_id; + app_client* client = get_app_client(app_id, client_id); + if (NULL == client) { + return (int)UNIFYFS_FAILURE; + } + + /* get thread control structure */ + reqmgr_thrd_t* thrd_ctrl = client->reqmgr; + + /* get chunks corresponding to requested client read extent + * + * Generate a pair of keys for the read request, representing the start + * and end offset. MDHIM returns all key-value pairs that fall within + * the offset range. + * + * TODO: this is specific to the MDHIM in the source tree and not portable + * to other KV-stores. This needs to be revisited to utilize some + * other mechanism to retrieve all relevant key-value pairs from the + * KV-store. 
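 * As a concrete example of the key generation below: with meta_slice_sz set
 * to 1 MiB (1048576 bytes), a read of length 1 MiB at offset 1.5 MiB covers
 * two slices, so split_request() emits two key pairs,
 * (gfid, 1572864)/(gfid, 2097151) and (gfid, 2097152)/(gfid, 2621439),
 * and the range lookup returns every indexed extent key falling between each
 * start/end pair. (The 1 MiB slice size is only for illustration; the actual
 * value is taken from the server configuration.)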
+ */ + + /* count number of slices this range covers */ + size_t slices = meta_num_slices(offset, length); + if (slices >= UNIFYFS_MAX_SPLIT_CNT) { + LOGERR("Error allocating buffers"); + return ENOMEM; + } + + /* allocate key storage */ + size_t key_cnt = slices * 2; + unifyfs_key_t** keys = alloc_key_array(key_cnt); + int* key_lens = (int*) calloc(key_cnt, sizeof(int)); + if ((NULL == keys) || + (NULL == key_lens)) { + // this is a fatal error + // TODO: we need better error handling + LOGERR("Error allocating buffers"); + return ENOMEM; + } + + /* split range of read request at boundaries used for + * MDHIM range query */ + split_request(keys, key_lens, gfid, offset, length); + + /* queue up the read operations */ + int rc = create_gfid_chunk_reads(thrd_ctrl, gfid, app_id, client_id, + key_cnt, keys, key_lens); + + /* free memory allocated for key storage */ + free_key_array(keys); + free(key_lens); + + return rc; +} + +static int mdhim_mread(unifyfs_fops_ctx_t* ctx, size_t num_req, void* reqbuf) +{ + int rc = UNIFYFS_SUCCESS; + int app_id = ctx->app_id; + int client_id = ctx->client_id; + unifyfs_extent_t* req; + unifyfs_extent_t* reqs = (unifyfs_extent_t*)reqbuf; + + /* get application client */ + app_client* client = get_app_client(app_id, client_id); + if (NULL == client) { + return (int)UNIFYFS_FAILURE; + } + + /* get thread control structure */ + reqmgr_thrd_t* thrd_ctrl = client->reqmgr; + + /* count up number of slices these request cover */ + int i; + size_t slices = 0; + for (i = 0; i < num_req; i++) { + req = reqs + i; + + /* get offset and length of next request */ + size_t off = req->offset; + size_t len = req->length; + + /* add in number of slices this request needs */ + slices += meta_num_slices(off, len); + } + if (slices >= UNIFYFS_MAX_SPLIT_CNT) { + LOGERR("Error allocating buffers"); + return ENOMEM; + } + + /* allocate key storage */ + size_t key_cnt = slices * 2; + unifyfs_key_t** keys = alloc_key_array(key_cnt); + int* key_lens = (int*) calloc(key_cnt, sizeof(int)); + if ((NULL == keys) || + (NULL == key_lens)) { + // this is a fatal error + // TODO: we need better error handling + LOGERR("Error allocating buffers"); + return ENOMEM; + } + + /* get chunks corresponding to requested client read extents */ + int ret; + int num_keys = 0; + int last_gfid = -1; + for (i = 0; i < num_req; i++) { + req = reqs + i; + + /* get the file id for this request */ + int gfid = req->gfid; + + /* if we have switched to a different file, create chunk reads + * for the previous file */ + if (i && (gfid != last_gfid)) { + /* create requests for all extents of last_gfid */ + ret = create_gfid_chunk_reads(thrd_ctrl, last_gfid, + app_id, client_id, + num_keys, keys, key_lens); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("Error creating chunk reads for gfid=%d", last_gfid); + rc = ret; + } + + /* reset key counter for the current gfid */ + num_keys = 0; + } + + /* get offset and length of current read request */ + size_t off = req->offset; + size_t len = req->length; + LOGDBG("gfid:%d, offset:%zu, length:%zu", gfid, off, len); + + /* Generate a pair of keys for each read request, representing + * the start and end offsets. MDHIM returns all key-value pairs that + * fall within the offset range. + * + * TODO: this is specific to the MDHIM in the source tree and not + * portable to other KV-stores. This needs to be revisited to + * utilize some other mechanism to retrieve all relevant KV + * pairs from the KV-store. 
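 * Note that keys are batched per file: they accumulate in keys[] until the
 * gfid changes, at which point create_gfid_chunk_reads() flushes the batch
 * (and the final batch is flushed after the loop). Requests for the same
 * gfid that are not adjacent in the incoming array therefore produce
 * separate chunk-read batches.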
+ */ + + /* split range of read request at boundaries used for + * MDHIM range query */ + int used = split_request(&keys[num_keys], &key_lens[num_keys], + gfid, off, len); + num_keys += used; + + /* keep track of the last gfid value that we processed */ + last_gfid = gfid; + } + + /* create requests for all extents of final gfid */ + ret = create_gfid_chunk_reads(thrd_ctrl, last_gfid, + app_id, client_id, + num_keys, keys, key_lens); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("Error creating chunk reads for gfid=%d", last_gfid); + rc = ret; + } + + /* free memory allocated for key storage */ + free_key_array(keys); + free(key_lens); + + return rc; +} + +static struct unifyfs_fops _fops_mdhim = { + .name = "mdhim", + .init = mdhim_init, + .metaget = mdhim_metaget, + .metaset = mdhim_metaset, + .fsync = mdhim_fsync, + .filesize = mdhim_filesize, + .truncate = mdhim_truncate, + .laminate = mdhim_laminate, + .unlink = mdhim_unlink, + .read = mdhim_read, + .mread = mdhim_mread, +}; + +struct unifyfs_fops* unifyfs_fops_impl = &_fops_mdhim; + diff --git a/server/src/unifyfs_fops_rpc.c b/server/src/unifyfs_fops_rpc.c new file mode 100644 index 000000000..0492e5885 --- /dev/null +++ b/server/src/unifyfs_fops_rpc.c @@ -0,0 +1,377 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#include "unifyfs_inode_tree.h" +#include "unifyfs_inode.h" +#include "unifyfs_group_rpc.h" +#include "unifyfs_p2p_rpc.h" +#include "unifyfs_request_manager.h" + + +static +int rpc_init(unifyfs_cfg_t* cfg) +{ + int ret = 0; + long range_sz = 0; + + LOGDBG("initializing file operations.."); + + ret = configurator_int_val(cfg->meta_range_size, &range_sz); + if (ret != 0) { + LOGERR("failed to read configuration (meta_range_size)"); + } + meta_slice_sz = (size_t) range_sz; + + return ret; +} + +static +int rpc_metaget(unifyfs_fops_ctx_t* ctx, + int gfid, + unifyfs_file_attr_t* attr) +{ + return unifyfs_invoke_metaget_rpc(gfid, attr); +} + +static +int rpc_metaset(unifyfs_fops_ctx_t* ctx, + int gfid, + int attr_op, + unifyfs_file_attr_t* attr) +{ + return unifyfs_invoke_metaset_rpc(gfid, attr_op, attr); +} + +/* + * sync rpc from client contains extents for a single gfid (file). 
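 * In contrast to the mdhim backend, which splits each write into slice-sized
 * key/value pairs, this path turns every client index entry into a single
 * extent_tree_node, adds it to the local inode, and then forwards the extents
 * to the file's owner server via unifyfs_invoke_add_extents_rpc().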
+ */ +static +int rpc_fsync(unifyfs_fops_ctx_t* ctx, + int gfid) +{ + size_t i; + + /* assume we'll succeed */ + int ret = UNIFYFS_SUCCESS; + + /* get memory page size on this machine */ + int page_sz = getpagesize(); + + /* get application client */ + app_client* client = get_app_client(ctx->app_id, ctx->client_id); + if (NULL == client) { + return EINVAL; + } + + /* get pointer to superblock for this client and app */ + shm_context* super_ctx = client->shmem_super; + if (NULL == super_ctx) { + LOGERR("missing client superblock"); + return UNIFYFS_FAILURE; + } + char* superblk = (char*)(super_ctx->addr); + + /* get pointer to start of key/value region in superblock */ + char* meta = superblk + client->super_meta_offset; + + /* get number of file extent index values client has for us, + * stored as a size_t value in meta region of shared memory */ + size_t num_extents = *(size_t*)(meta); + + /* indices are stored in the superblock shared memory + * created by the client, these are stored as index_t + * structs starting one page size offset into meta region + * + * Is it safe to assume that the index information in this superblock is + * not going to be modified by the client while we perform this operation? + */ + char* ptr_extents = meta + page_sz; + + if (num_extents == 0) { + return UNIFYFS_SUCCESS; /* Nothing to do */ + } + + unifyfs_index_t* meta_payload = (unifyfs_index_t*)(ptr_extents); + + struct extent_tree_node* extents = calloc(num_extents, sizeof(*extents)); + if (!extents) { + LOGERR("failed to allocate memory for local_extents"); + return ENOMEM; + } + + /* the sync rpc now contains extents from a single file/gfid */ + assert(gfid == meta_payload[0].gfid); + + for (i = 0; i < num_extents; i++) { + struct extent_tree_node* extent = &extents[i]; + unifyfs_index_t* meta = &meta_payload[i]; + + extent->start = meta->file_pos; + extent->end = (meta->file_pos + meta->length) - 1; + extent->svr_rank = glb_pmi_rank; + extent->app_id = ctx->app_id; + extent->cli_id = ctx->client_id; + extent->pos = meta->log_pos; + } + + /* update local inode state first */ + ret = unifyfs_inode_add_extents(gfid, num_extents, extents); + if (ret) { + LOGERR("failed to add local extents (gfid=%d, ret=%d)", gfid, ret); + return ret; + } + + /* then update owner inode state */ + ret = unifyfs_invoke_add_extents_rpc(gfid, num_extents, extents); + if (ret) { + LOGERR("failed to add extents (gfid=%d, ret=%d)", gfid, ret); + } + + return ret; +} + +static +int rpc_filesize(unifyfs_fops_ctx_t* ctx, + int gfid, + size_t* filesize) +{ + return unifyfs_invoke_filesize_rpc(gfid, filesize); +} + +static +int rpc_truncate(unifyfs_fops_ctx_t* ctx, + int gfid, + off_t len) +{ + return unifyfs_invoke_truncate_rpc(gfid, len); +} + +static +int rpc_laminate(unifyfs_fops_ctx_t* ctx, + int gfid) +{ + return unifyfs_invoke_laminate_rpc(gfid); +} + +static +int rpc_unlink(unifyfs_fops_ctx_t* ctx, + int gfid) +{ + return unifyfs_invoke_broadcast_unlink(gfid); +} + +static +int create_remote_read_requests(unsigned int n_chunks, + chunk_read_req_t* chunks, + unsigned int* outlen, + server_chunk_reads_t** out) +{ + int prev_rank = -1; + unsigned int num_server_reads = 0; + unsigned int i = 0; + server_chunk_reads_t* remote_reads = NULL; + server_chunk_reads_t* current = NULL; + chunk_read_req_t* pos = NULL; + + /* count how many servers we need to contact */ + for (i = 0; i < n_chunks; i++) { + chunk_read_req_t* curr_chunk = &chunks[i]; + int curr_rank = curr_chunk->rank; + if (curr_rank != prev_rank) { + num_server_reads++; + 
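            /* this count assumes chunks[] arrives with all requests for a
             * given server rank stored contiguously (e.g. ranks [3,3,7,7,5]
             * give three server reads); a rank that reappears later in the
             * array would start an additional server_chunk_reads_t entry */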
} + prev_rank = curr_rank; + } + + /* allocate and fill the per-server request data structure */ + remote_reads = (server_chunk_reads_t*) calloc(num_server_reads, + sizeof(*remote_reads)); + if (!remote_reads) { + LOGERR("failed to allocate memory for remote_reads"); + return ENOMEM; + } + + pos = chunks; + unsigned int processed = 0; + + LOGDBG("preparing remote read request for %u chunks (%d servers)", + n_chunks, num_server_reads); + + for (i = 0; i < num_server_reads; i++) { + int rank = pos->rank; + + current = &remote_reads[i]; + current->rank = rank; + current->reqs = pos; + + for ( ; processed < n_chunks; pos++) { + if (pos->rank != rank) { + break; + } + current->total_sz += pos->nbytes; + current->num_chunks++; + processed++; + } + + LOGDBG("%u/%u chunks processed: server %d (%u chunks, %zu bytes)", + processed, n_chunks, rank, + current->num_chunks, current->total_sz); + } + + *outlen = num_server_reads; + *out = remote_reads; + return UNIFYFS_SUCCESS; +} + +static +int submit_read_request(unifyfs_fops_ctx_t* ctx, + size_t count, + unifyfs_inode_extent_t* extents) +{ + if ((count == 0) || (NULL == extents)) { + return EINVAL; + } + + LOGDBG("handling read request (%zu chunk requests)", count); + + /* see if we have a valid app information */ + int app_id = ctx->app_id; + int client_id = ctx->client_id; + + /* get application client */ + app_client* client = get_app_client(app_id, client_id); + if (NULL == client) { + return UNIFYFS_FAILURE; + } + + /* group requested extents by gfid */ + int ret = UNIFYFS_SUCCESS; + int extent_ndx = 0; + while (extent_ndx < (int)count) { + int curr_ndx = extent_ndx; + int curr_gfid = extents[extent_ndx].gfid; + + /* get count of extents for current gfid */ + unsigned int curr_count = 0; + while ((curr_ndx < count) && (extents[curr_ndx].gfid == curr_gfid)) { + curr_count++; + curr_ndx++; + } + + unsigned int n_chunks = 0; + chunk_read_req_t* chunks = NULL; + int rc = unifyfs_invoke_find_extents_rpc(curr_gfid, curr_count, + extents + extent_ndx, + &n_chunks, &chunks); + if (rc) { + LOGERR("failed to find extent locations"); + return rc; + } + if (n_chunks > 0) { + /* prepare the read request requests */ + unsigned int n_remote_reads = 0; + server_chunk_reads_t* remote_reads = NULL; + rc = create_remote_read_requests(n_chunks, chunks, + &n_remote_reads, &remote_reads); + if (rc) { + LOGERR("failed to prepare the remote read requests"); + if (NULL != chunks) { + free(chunks); + } + return rc; + } + + /* fill the information of server_read_req_t and submit */ + server_read_req_t rdreq = { 0, }; + rdreq.app_id = app_id; + rdreq.client_id = client_id; + rdreq.chunks = chunks; + rdreq.num_server_reads = (int) n_remote_reads; + rdreq.remote_reads = remote_reads; + ret = rm_submit_read_request(&rdreq); + } else { + ret = ENODATA; + } + + /* advance to next group */ + extent_ndx += curr_count; + } + + return ret; +} + +static +int rpc_read(unifyfs_fops_ctx_t* ctx, + int gfid, + off_t offset, + size_t length) +{ + unifyfs_inode_extent_t chunk = { 0, }; + + chunk.gfid = gfid; + chunk.offset = offset; + chunk.length = length; + + return submit_read_request(ctx, 1, &chunk); +} + +static +int rpc_mread(unifyfs_fops_ctx_t* ctx, + size_t n_req, + void* read_reqs) +{ + int ret = UNIFYFS_SUCCESS; + int i = 0; + unifyfs_inode_extent_t* chunks = NULL; + unifyfs_extent_t* reqs = (unifyfs_extent_t*) read_reqs; + + chunks = calloc(n_req, sizeof(*chunks)); + if (!chunks) { + LOGERR("failed to allocate the chunk request"); + return ENOMEM; + } + + for (i = 0; i < 
(int)n_req; i++) { + unifyfs_inode_extent_t* ch = chunks + i; + unifyfs_extent_t* req = reqs + i; + ch->gfid = req->gfid; + ch->offset = req->offset; + ch->length = req->length; + } + + ret = submit_read_request(ctx, n_req, chunks); + + if (chunks) { + free(chunks); + chunks = NULL; + } + + return ret; +} + +static struct unifyfs_fops _fops_rpc = { + .name = "rpc", + .init = rpc_init, + .metaget = rpc_metaget, + .metaset = rpc_metaset, + .fsync = rpc_fsync, + .filesize = rpc_filesize, + .truncate = rpc_truncate, + .laminate = rpc_laminate, + .unlink = rpc_unlink, + .read = rpc_read, + .mread = rpc_mread, +}; + +struct unifyfs_fops* unifyfs_fops_impl = &_fops_rpc; diff --git a/server/src/unifyfs_global.h b/server/src/unifyfs_global.h index 9e5f9c01e..6b4bed1e4 100644 --- a/server/src/unifyfs_global.h +++ b/server/src/unifyfs_global.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -29,6 +29,7 @@ #ifndef UNIFYFS_GLOBAL_H #define UNIFYFS_GLOBAL_H +#include // system headers #include @@ -40,120 +41,76 @@ #include #include #include +#include +#include // common headers #include "arraylist.h" #include "unifyfs_const.h" #include "unifyfs_log.h" +#include "unifyfs_logio.h" #include "unifyfs_meta.h" #include "unifyfs_shm.h" -#include "unifyfs_sock.h" +#include "unifyfs_fops.h" +#include "unifyfs_client_rpcs.h" +#include "unifyfs_server_rpcs.h" -#include -#include -extern arraylist_t* app_config_list; -extern arraylist_t* rm_thrd_list; +/* Some global variables/structures used throughout the server code */ +/* PMI server rank and server count */ +extern int glb_pmi_rank; +extern int glb_pmi_size; +extern int server_pid; + +/* hostname for this server */ extern char glb_host[UNIFYFS_MAX_HOSTNAME]; -extern int glb_mpi_rank, glb_mpi_size; -extern size_t max_recs_per_slice; +typedef struct { + //char* hostname; + char* margo_svr_addr_str; + hg_addr_t margo_svr_addr; + int pmi_rank; +} server_info_t; + +extern server_info_t* glb_servers; /* array of server info structs */ +extern size_t glb_num_servers; /* number of entries in glb_servers array */ + +extern struct unifyfs_inode_tree* global_inode_tree; /* global inode tree */ /* defines commands for messages sent to service manager threads */ typedef enum { SVC_CMD_INVALID = 0, - SVC_CMD_RDREQ_MSG = 1, /* read requests (send_msg_t) */ SVC_CMD_RDREQ_CHK, /* read requests (chunk_read_req_t) */ - SVC_CMD_EXIT, /* service manager thread should exit */ } service_cmd_e; -typedef enum { - READ_REQUEST_TAG = 5001, - READ_RESPONSE_TAG = 6001, - CHUNK_REQUEST_TAG = 7001, - CHUNK_RESPONSE_TAG = 8001 -} service_tag_e; - -/* this defines a read request as sent from the request manager to the - * service manager, it contains info about the physical location of - * the data: - * - * dest_delegator_rank - rank of delegator hosting data log file - * dest_app_id, dest_client_id - defines file on host delegator - * dest_offset - phyiscal offset of data in log file - * length - number of bytes to be read - * - * it also contains a return address to use in the read reply that - * the service manager sends back to the request manager: - * - * src_delegator_rank - rank of requesting delegator process - * src_thrd - thread id of request manager (used to compute MPI tag) - * src_app_id, 
src_cli_id - * src_fid - global file id - * src_offset - starting offset in logical file - * length - number of bytes - * src_dbg_rank - rank of application process making the request - * - * the arrival_time field is included but not set by the request - * manager, it is used to tag the time the request reaches the - * service manager for prioritizing read replies */ -typedef struct { - int dest_app_id; /* app id of log file */ - int dest_client_id; /* client id of log file */ - size_t dest_offset; /* data offset within log file */ - int dest_delegator_rank; /* delegator rank of service manager */ - size_t length; /* length of data to be read */ - int src_delegator_rank; /* delegator rank of request manager */ - int src_cli_id; /* client id of requesting client process */ - int src_app_id; /* app id of requesting client process */ - int src_fid; /* global file id */ - size_t src_offset; /* logical file offset */ - int src_thrd; /* thread id of request manager */ - int src_dbg_rank; /* MPI rank of client process */ - int arrival_time; /* records time reaches service mgr */ -} send_msg_t; - -/* defines header for read reply messages sent from service manager - * back to request manager, data payload of length bytes immediately - * follows the header */ -typedef struct { - size_t src_offset; /* file offset */ - size_t length; /* number of bytes */ - int src_fid; /* global file id */ - int errcode; /* indicates whether read was successful */ -} recv_msg_t; - -/* defines a fixed-length list of read requests */ -typedef struct { - int num; /* number of active read requests */ - send_msg_t msg_meta[MAX_META_PER_SEND]; /* list of requests */ -} msg_meta_t; - // NEW READ REQUEST STRUCTURES typedef enum { - READREQ_INIT = 0, + READREQ_NULL = 0, /* request not initialized */ + READREQ_READY, /* request ready to be issued */ READREQ_STARTED, /* chunk requests issued */ - READREQ_PARTIAL_COMPLETE, /* some reads completed */ - READREQ_COMPLETE /* all reads completed */ + READREQ_COMPLETE, /* all reads completed */ } readreq_status_e; typedef struct { + int gfid; /* gfid */ size_t nbytes; /* size of data chunk */ size_t offset; /* file offset */ size_t log_offset; /* remote log offset */ int log_app_id; /* remote log application id */ int log_client_id; /* remote log client id */ + int rank; /* remote server rank who holds data */ } chunk_read_req_t; typedef struct { + int gfid; /* gfid */ size_t offset; /* file offset */ size_t nbytes; /* requested read size */ ssize_t read_rc; /* bytes read (or negative error code) */ } chunk_read_resp_t; typedef struct { - int rank; /* remote delegator rank */ + int rank; /* server rank */ int rdreq_id; /* read-request id */ int app_id; /* app id of requesting client process */ int client_id; /* client id of requesting client process */ @@ -164,7 +121,7 @@ typedef struct { * @SM: received requests buffer */ chunk_read_resp_t* resp; /* @RM: received responses buffer * @SM: allocated responses buffer */ -} remote_chunk_reads_t; +} server_chunk_reads_t; typedef struct { size_t length; /* length of data to read */ @@ -173,81 +130,73 @@ typedef struct { int errcode; /* request completion status */ } client_read_req_t; -/* one of these structures is created for each app id, - * it contains info for each client like names, file descriptors, - * and memory locations of file data - * - * file data stored in the superblock is in memory, - * this is mapped as a shared memory region by the delegator - * process, this data can be accessed by service manager threads - * using 
memcpy() - * - * when the super block is full, file data is written - * to the spillover file, data here can be accessed by - * service manager threads via read() calls */ -typedef struct { - /* global values which are identical across all clients, - * for this given app id */ - size_t superblock_sz; /* size of memory region used to store data */ - size_t meta_offset; /* superblock offset to index metadata */ - size_t meta_size; /* size of index metadata region in bytes */ - size_t fmeta_offset; /* superblock offset to file attribute metadata */ - size_t fmeta_size; /* size of file attribute metadata region in bytes */ - size_t data_offset; /* superblock offset to data log */ - size_t data_size; /* size of data log in bytes */ - size_t req_buf_sz; /* buffer size for client to issue read requests */ - size_t recv_buf_sz; /* buffer size for read replies to client */ - - /* number of clients on the node */ - int num_procs_per_node; - - /* map from socket id to other values */ - int client_ranks[MAX_NUM_CLIENTS]; /* map to client id */ - int thrd_idxs[MAX_NUM_CLIENTS]; /* map to thread id */ - int dbg_ranks[MAX_NUM_CLIENTS]; /* map to client rank */ - - /* file descriptors */ - int spill_log_fds[MAX_NUM_CLIENTS]; /* spillover data */ - int spill_index_log_fds[MAX_NUM_CLIENTS]; /* spillover index */ - - /* shared memory pointers */ - char* shm_superblocks[MAX_NUM_CLIENTS]; /* superblock data */ - char* shm_req_bufs[MAX_NUM_CLIENTS]; /* read request shm */ - char* shm_recv_bufs[MAX_NUM_CLIENTS]; /* read reply shm */ - - /* client address for rpc invocation */ - hg_addr_t client_addr[MAX_NUM_CLIENTS]; - - /* file names */ - char super_buf_name[MAX_NUM_CLIENTS][UNIFYFS_MAX_FILENAME]; - char req_buf_name[MAX_NUM_CLIENTS][UNIFYFS_MAX_FILENAME]; - char recv_buf_name[MAX_NUM_CLIENTS][UNIFYFS_MAX_FILENAME]; - char spill_log_name[MAX_NUM_CLIENTS][UNIFYFS_MAX_FILENAME]; - char spill_index_log_name[MAX_NUM_CLIENTS][UNIFYFS_MAX_FILENAME]; - - /* directory holding spill over files */ - char external_spill_dir[UNIFYFS_MAX_FILENAME]; -} app_config_t; - -typedef int fattr_key_t; +// forward declaration of reqmgr_thrd +struct reqmgr_thrd; -typedef struct { - char fname[UNIFYFS_MAX_FILENAME]; - struct stat file_attr; -} fattr_val_t; +/** + * Structure to maintain application client state, including + * logio and shared memory contexts, margo rpc address, etc. + */ +typedef struct app_client { + int app_id; /* index of app in server app_configs array */ + int client_id; /* this client's index in app's clients array */ + int dbg_rank; /* client debug rank - NOT CURRENTLY USED */ + int connected; /* is client currently connected? */ -int invert_sock_ids[MAX_NUM_CLIENTS]; + hg_addr_t margo_addr; /* client Margo address */ -typedef struct { - //char* hostname; - char* margo_svr_addr_str; - hg_addr_t margo_svr_addr; - int mpi_rank; -} server_info_t; + struct reqmgr_thrd* reqmgr; /* this client's request manager thread */ -extern char glb_host[UNIFYFS_MAX_HOSTNAME]; -extern size_t glb_num_servers; -extern server_info_t* glb_servers; + logio_context* logio; /* logio context for write data */ + + shm_context* shmem_data; /* shmem context for read data */ + + shm_context* shmem_super; /* shmem context for superblock region */ + size_t super_meta_offset; /* superblock offset to index metadata */ + size_t super_meta_size; /* size of index metadata region in bytes */ +} app_client; + +/** + * Structure to maintain application configuration state + * and track connected clients. 
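+ *
+ * A rough lifecycle sketch (assumed call order based on the helper
+ * declarations below; argument names are placeholders):
+ *
+ *   app_config* app = get_application(app_id);
+ *   if (NULL == app) {
+ *       app = new_application(app_id);
+ *   }
+ *   app_client* clnt = new_app_client(app, margo_addr_str, dbg_rank);
+ *   attach_app_client(clnt, spill_dir, spill_size, shmem_size,
+ *                     data_size, super_size, meta_offset, meta_size);
+ *   ...
+ *   disconnect_app_client(clnt);
+ *   cleanup_app_client(app, clnt);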
+ */ +typedef struct app_config { + /* application id - MD5(mount_prefix) */ + int app_id; + + /* mount prefix for application's UnifyFS files */ + char mount_prefix[UNIFYFS_MAX_FILENAME]; + + /* array of clients associated with this app */ + size_t num_clients; + size_t clients_sz; + app_client** clients; +} app_config; + +app_config* get_application(int app_id); + +app_config* new_application(int app_id); + +unifyfs_rc cleanup_application(app_config* app); + +app_client* get_app_client(int app_id, + int client_id); + +app_client* new_app_client(app_config* app, + const char* margo_addr_str, + const int dbg_rank); + +unifyfs_rc attach_app_client(app_client* client, + const char* logio_spill_dir, + const size_t logio_spill_size, + const size_t logio_shmem_size, + const size_t shmem_data_size, + const size_t shmem_super_size, + const size_t super_meta_offset, + const size_t super_meta_size); + +unifyfs_rc disconnect_app_client(app_client* clnt); +unifyfs_rc cleanup_app_client(app_config* app, app_client* clnt); #endif // UNIFYFS_GLOBAL_H diff --git a/server/src/unifyfs_group_rpc.c b/server/src/unifyfs_group_rpc.c new file mode 100644 index 000000000..9c308441a --- /dev/null +++ b/server/src/unifyfs_group_rpc.c @@ -0,0 +1,1147 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#include "unifyfs_global.h" +#include "unifyfs_tree.h" +#include "margo_server.h" +#include "unifyfs_server_rpcs.h" +#include "unifyfs_group_rpc.h" + +#ifndef UNIFYFS_BCAST_K_ARY +# define UNIFYFS_BCAST_K_ARY 2 +#endif + +/* server collective (coll) margo request structure */ +typedef struct { + margo_request request; + hg_handle_t handle; +} coll_request; + +/* helper method to initialize collective request rpc handle for child peer */ +static int get_request_handle(hg_id_t request_hgid, + int peer_rank, + coll_request* creq) +{ + int rc = UNIFYFS_SUCCESS; + + /* get address for specified server rank */ + hg_addr_t addr = glb_servers[peer_rank].margo_svr_addr; + + /* get handle to rpc function */ + hg_return_t hret = margo_create(unifyfsd_rpc_context->svr_mid, addr, + request_hgid, &(creq->handle)); + if (hret != HG_SUCCESS) { + LOGERR("failed to get handle for request(%p) to server %d", + creq, peer_rank); + rc = UNIFYFS_ERROR_MARGO; + } + + return rc; +} + +/* helper method to forward collective rpc request to one child */ +static int forward_request(void* input_ptr, + coll_request* creq) +{ + int rc = UNIFYFS_SUCCESS; + + /* call rpc function */ + hg_return_t hret = margo_iforward(creq->handle, input_ptr, + &(creq->request)); + if (hret != HG_SUCCESS) { + LOGERR("failed to forward request(%p)", creq); + rc = UNIFYFS_ERROR_MARGO; + } + + return rc; +} + +/* helper method to wait for collective rpc child request completion */ +static int wait_for_request(coll_request* creq) +{ + int rc = UNIFYFS_SUCCESS; + + /* call rpc function */ + hg_return_t hret = margo_wait(creq->request); + if (hret != HG_SUCCESS) { + LOGERR("wait on request(%p) failed", creq); + rc = UNIFYFS_ERROR_MARGO; + } + + return rc; +} + +/************************************************************************* + * Broadcast file extents metadata + 
*************************************************************************/ + + +/* file extents metadata broadcast rpc handler */ +static void extent_bcast_rpc(hg_handle_t handle) +{ + LOGDBG("MARGOTREE: extent bcast handler"); + + /* assume we'll succeed */ + int32_t ret = UNIFYFS_SUCCESS; + + /* get instance id */ + margo_instance_id mid = margo_hg_handle_get_instance(handle); + + /* get input params */ + extent_bcast_in_t in; + hg_return_t hret = margo_get_input(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* get root of tree and global file id to lookup filesize + * record tag calling process wants us to include in our + * later response */ + int gfid = (int) in.gfid; + int32_t num_extents = (int32_t) in.num_extents; + + /* allocate memory for extents */ + struct extent_tree_node* extents; + extents = calloc(num_extents, sizeof(struct extent_tree_node)); + + /* get client address */ + const struct hg_info* info = margo_get_info(handle); + hg_addr_t client_address = info->addr; + + /* expose local bulk buffer */ + hg_size_t buf_size = num_extents * sizeof(struct extent_tree_node); + hg_bulk_t extent_data; + void* datap = extents; + hret = margo_bulk_create(mid, 1, &datap, &buf_size, + HG_BULK_READWRITE, &extent_data); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_create() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + int i, rc; + hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.extent_bcast_id; + + /* create communication tree structure */ + unifyfs_tree_t bcast_tree; + unifyfs_tree_init(glb_pmi_rank, glb_pmi_size, in.root, + UNIFYFS_BCAST_K_ARY, &bcast_tree); + + /* initiate data transfer */ + margo_request bulk_request; + hret = margo_bulk_itransfer(mid, HG_BULK_PULL, client_address, + in.extents, 0, + extent_data, 0, + buf_size, + &bulk_request); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_itransfer() failed"); + ret = UNIFYFS_ERROR_MARGO; + } + + /* update input structure to point to local bulk handle */ + in.extents = extent_data; + + /* allocate memory for request objects + * TODO: possibly get this from memory pool */ + coll_request* requests = + calloc(bcast_tree.child_count, sizeof(*requests)); + if (NULL == requests) { + ret = ENOMEM; + } else { + /* allocate mercury handles for forwarding the request */ + for (i = 0; i < bcast_tree.child_count; i++) { + /* allocate handle for request to this child */ + int child = bcast_tree.child_ranks[i]; + get_request_handle(req_hgid, child, requests+i); + } + } + + /* wait for data transfer to finish */ + hret = margo_wait(bulk_request); + if (hret != HG_SUCCESS) { + LOGERR("margo_wait() for bulk transfer failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + LOGDBG("received %d extents (%zu bytes) from %d", + num_extents, (size_t)buf_size, (int)in.root); + + if (NULL != requests) { + /* forward request down the tree */ + for (i = 0; i < bcast_tree.child_count; i++) { + /* invoke filesize request rpc on child */ + rc = forward_request((void*)&in, requests+i); + } + } + + ret = unifyfs_inode_add_extents(gfid, num_extents, extents); + if (ret) { + LOGERR("add of remote extents failed (ret=%d)", ret); + // what do we do now? 
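+                    // Note: the request has already been forwarded to any
+                    // children above, so the broadcast cannot be aborted
+                    // here; the failure is only propagated back to the
+                    // parent via out.ret in the rpc response.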
+ } + LOGDBG("added %d extents (%zu bytes) from %d", + num_extents, (size_t)buf_size, (int)in.root); + + if (NULL != requests) { + /* wait for the requests to finish */ + coll_request* req; + for (i = 0; i < bcast_tree.child_count; i++) { + req = requests + i; + rc = wait_for_request(req); + if (rc == UNIFYFS_SUCCESS) { + /* get the output of the rpc */ + extent_bcast_out_t out; + hret = margo_get_output(req->handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* set return value */ + int child_ret = (int) out.ret; + LOGDBG("MARGOTREE: extbcast child[%d] " + "response: %d", i, child_ret); + if (child_ret != UNIFYFS_SUCCESS) { + ret = child_ret; + } + margo_free_output(req->handle, &out); + } + margo_destroy(req->handle); + } else { + ret = rc; + } + } + free(requests); + } + } + /* free bulk data handle */ + margo_bulk_free(extent_data); + + /* release communication tree resources */ + unifyfs_tree_free(&bcast_tree); + } + margo_free_input(handle, &in); + } + + /* build our output values */ + extent_bcast_out_t out; + out.ret = ret; + + /* send output back to caller */ + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + LOGDBG("MARGOTREE: extent bcast rpc handler - responded"); + + /* free margo resources */ + margo_destroy(handle); +} +DEFINE_MARGO_RPC_HANDLER(extent_bcast_rpc) + +/* Forward the extent broadcast to all children and wait for responses */ +static +int extent_bcast_forward(const unifyfs_tree_t* broadcast_tree, + extent_bcast_in_t* in) +{ + LOGDBG("MARGOTREE: extent bcast forward"); + + /* get info for tree */ + int child_count = broadcast_tree->child_count; + if (0 == child_count) { + return UNIFYFS_SUCCESS; + } + + int* child_ranks = broadcast_tree->child_ranks; + + /* allocate memory for request objects + * TODO: possibly get this from memory pool */ + coll_request* requests = calloc(child_count, + sizeof(*requests)); + + /* forward request down the tree */ + int i, rc, ret; + coll_request* req; + hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.extent_bcast_id; + for (i = 0; i < child_count; i++) { + req = requests + i; + + /* allocate handle */ + rc = get_request_handle(req_hgid, child_ranks[i], req); + if (rc == UNIFYFS_SUCCESS) { + /* invoke extbcast request rpc on child */ + rc = forward_request((void*)in, req); + } else { + ret = rc; + } + } + + /* wait for the requests to finish */ + for (i = 0; i < child_count; i++) { + req = requests + i; + rc = wait_for_request(req); + if (rc == UNIFYFS_SUCCESS) { + LOGDBG("MARGOTREE: extent bcast - child[%d] responded", i); + /* get the output of the rpc */ + extent_bcast_out_t out; + hg_return_t hret = margo_get_output(req->handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* set return value */ + int child_ret = out.ret; + if (child_ret != UNIFYFS_SUCCESS) { + ret = child_ret; + } + margo_free_output(req->handle, &out); + } + margo_destroy(req->handle); + } else { + ret = rc; + } + } + + return ret; +} + +/* Execute broadcast tree for extent metadata */ +int unifyfs_invoke_broadcast_extents_rpc(int gfid, unsigned int len, + struct extent_tree_node* extents) +{ + /* assuming success */ + int ret = UNIFYFS_SUCCESS; + + /* create communication tree */ + unifyfs_tree_t bcast_tree; + unifyfs_tree_init(glb_pmi_rank, glb_pmi_size, glb_pmi_rank, + UNIFYFS_BCAST_K_ARY, &bcast_tree); + + hg_size_t num_extents = len; + hg_size_t buf_size = 
num_extents * sizeof(*extents); + + LOGDBG("broadcasting %u extents for gfid=%d)", + len, gfid); + + /* create bulk data structure containing the extents + * NOTE: bulk data is always read only at the root of the broadcast tree */ + hg_bulk_t extents_bulk; + void* datap = (void*) extents; + hg_return_t hret = margo_bulk_create(unifyfsd_rpc_context->svr_mid, 1, + &datap, &buf_size, + HG_BULK_READ_ONLY, &extents_bulk); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_create() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* fill in input struct */ + extent_bcast_in_t in; + in.root = (int32_t)glb_pmi_rank; + in.gfid = gfid; + in.num_extents = num_extents; + in.extents = extents_bulk; + + extent_bcast_forward(&bcast_tree, &in); + + /* free bulk data handle */ + margo_bulk_free(extents_bulk); + } + + /* free tree resources and passed extents */ + unifyfs_tree_free(&bcast_tree); + free(extents); + + return ret; +} + +/************************************************************************* + * Broadcast file attributes and extents metadata due to laminate + *************************************************************************/ + +/* file extents metadata broadcast rpc handler */ +static void laminate_bcast_rpc(hg_handle_t handle) +{ + LOGDBG("MARGOTREE: laminate bcast handler"); + + int32_t ret; + + /* get instance id */ + margo_instance_id mid = margo_hg_handle_get_instance(handle); + + /* get input params */ + laminate_bcast_in_t in; + hg_return_t hret = margo_get_input(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* get root of tree and global file id to lookup filesize + * record tag calling process wants us to include in our + * later response */ + int gfid = (int) in.gfid; + size_t num_extents = (size_t) in.num_extents; + unifyfs_file_attr_t* fattr = &(in.attr); + + /* allocate memory for extents */ + struct extent_tree_node* extents; + extents = calloc(num_extents, sizeof(struct extent_tree_node)); + + /* get client address */ + const struct hg_info* info = margo_get_info(handle); + hg_addr_t client_address = info->addr; + + /* expose local bulk buffer */ + hg_size_t buf_size = num_extents * sizeof(struct extent_tree_node); + hg_bulk_t extent_data; + void* datap = extents; + hret = margo_bulk_create(mid, 1, &datap, &buf_size, + HG_BULK_READWRITE, &extent_data); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_create() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + int i, rc; + hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.laminate_bcast_id; + + /* create communication tree structure */ + unifyfs_tree_t bcast_tree; + unifyfs_tree_init(glb_pmi_rank, glb_pmi_size, in.root, + UNIFYFS_BCAST_K_ARY, &bcast_tree); + + /* initiate data transfer */ + margo_request bulk_request; + hret = margo_bulk_itransfer(mid, HG_BULK_PULL, + client_address, in.extents, 0, + extent_data, 0, + buf_size, &bulk_request); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_itransfer() failed"); + ret = UNIFYFS_ERROR_MARGO; + } + + /* allocate memory for request objects + * TODO: possibly get this from memory pool */ + coll_request* requests = + calloc(bcast_tree.child_count, sizeof(*requests)); + if (NULL == requests) { + ret = ENOMEM; + } else { + /* allocate mercury handles for forwarding the request */ + for (i = 0; i < bcast_tree.child_count; i++) { + /* allocate handle for request to this child */ + int child = bcast_tree.child_ranks[i]; + get_request_handle(req_hgid, child, requests+i); + } + } + + /* wait for data 
transfer to finish */ + hret = margo_wait(bulk_request); + if (hret != HG_SUCCESS) { + LOGERR("margo_wait() for bulk transfer failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + LOGDBG("laminating gfid=%d, received %zu extents from %d", + gfid, num_extents, (int)in.root); + + if (NULL != requests) { + /* update input structure to point to local bulk handle */ + in.extents = extent_data; + + /* forward request down the tree */ + for (i = 0; i < bcast_tree.child_count; i++) { + /* invoke filesize request rpc on child */ + rc = forward_request((void*)&in, requests+i); + } + } + + /* add the final set of extents */ + ret = unifyfs_inode_add_extents(gfid, num_extents, extents); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("laminate extents update failed (ret=%d)", ret); + } + + /* update attributes only after final extents added */ + ret = unifyfs_inode_metaset(gfid, + UNIFYFS_FILE_ATTR_OP_LAMINATE, + fattr); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("laminate attrs update failed (ret=%d)", ret); + } + + if (NULL != requests) { + /* wait for the requests to finish */ + coll_request* req; + for (i = 0; i < bcast_tree.child_count; i++) { + req = requests + i; + rc = wait_for_request(req); + if (rc == UNIFYFS_SUCCESS) { + /* get the output of the rpc */ + laminate_bcast_out_t out; + hret = margo_get_output(req->handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* set return value */ + int child_ret = (int) out.ret; + LOGDBG("MARGOTREE: laminate child[%d] " + "response: %d", i, child_ret); + if (child_ret != UNIFYFS_SUCCESS) { + ret = child_ret; + } + margo_free_output(req->handle, &out); + } + margo_destroy(req->handle); + } else { + ret = rc; + } + } + free(requests); + } + } + /* free bulk data handle */ + margo_bulk_free(extent_data); + + /* release communication tree resources */ + unifyfs_tree_free(&bcast_tree); + } + margo_free_input(handle, &in); + } + + /* build our output values */ + laminate_bcast_out_t out; + out.ret = ret; + + /* send output back to caller */ + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + LOGDBG("MARGOTREE: laminate bcast handler - responded"); + + /* free margo resources */ + margo_destroy(handle); +} +DEFINE_MARGO_RPC_HANDLER(laminate_bcast_rpc) + +/* Forward the laminate broadcast to all children and wait for responses */ +static +int laminate_bcast_forward(const unifyfs_tree_t* broadcast_tree, + laminate_bcast_in_t* in) +{ + /* get info for tree */ + int* child_ranks = broadcast_tree->child_ranks; + int child_count = broadcast_tree->child_count; + if (0 == child_count) { + return UNIFYFS_SUCCESS; + } + + int gfid = (int) in->gfid; + LOGDBG("MARGOTREE: laminate bcast forward for gfid=%d", gfid); + + /* allocate memory for request objects + * TODO: possibly get this from memory pool */ + coll_request* requests = calloc(child_count, + sizeof(*requests)); + + /* forward request down the tree */ + int i, rc, ret; + coll_request* req; + hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.laminate_bcast_id; + for (i = 0; i < child_count; i++) { + req = requests + i; + + /* allocate handle */ + rc = get_request_handle(req_hgid, child_ranks[i], req); + if (rc == UNIFYFS_SUCCESS) { + /* invoke extbcast request rpc on child */ + rc = forward_request((void*)in, req); + } else { + ret = rc; + } + } + + /* wait for the requests to finish */ + for (i = 0; i < child_count; i++) { + req = requests + i; + rc = wait_for_request(req); + if (rc == 
UNIFYFS_SUCCESS) { + LOGDBG("MARGOTREE: laminate bcast - child[%d] responded", i); + /* get the output of the rpc */ + laminate_bcast_out_t out; + hg_return_t hret = margo_get_output(req->handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* set return value */ + int child_ret = out.ret; + if (child_ret != UNIFYFS_SUCCESS) { + ret = child_ret; + } + margo_free_output(req->handle, &out); + } + margo_destroy(req->handle); + } else { + ret = rc; + } + } + + return ret; +} + +/* Execute broadcast tree for attributes and extent metadata due to laminate */ +int unifyfs_invoke_broadcast_laminate(int gfid) +{ + int ret; + + LOGDBG("broadcasting laminate for gfid=%d", gfid); + + /* get attributes and extents metadata */ + unifyfs_file_attr_t attrs; + ret = unifyfs_inode_metaget(gfid, &attrs); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("failed to get file attributes for gfid=%d", gfid); + return ret; + } + + size_t n_extents; + struct extent_tree_node* extents; + ret = unifyfs_inode_get_extents(gfid, &n_extents, &extents); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("failed to get extents for gfid=%d", gfid); + return ret; + } + + /* create bulk data structure containing the extents + * NOTE: bulk data is always read only at the root of the broadcast tree */ + hg_size_t num_extents = n_extents; + hg_size_t buf_size = num_extents * sizeof(*extents); + hg_bulk_t extents_bulk; + void* datap = (void*) extents; + hg_return_t hret = margo_bulk_create(unifyfsd_rpc_context->svr_mid, 1, + &datap, &buf_size, + HG_BULK_READ_ONLY, &extents_bulk); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_create() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* create broadcast communication tree */ + unifyfs_tree_t bcast_tree; + unifyfs_tree_init(glb_pmi_rank, glb_pmi_size, glb_pmi_rank, + UNIFYFS_BCAST_K_ARY, &bcast_tree); + + /* fill input struct and forward */ + laminate_bcast_in_t in; + in.root = (int32_t) glb_pmi_rank; + in.gfid = (int32_t) gfid; + in.attr = attrs; + in.num_extents = (int32_t) num_extents; + in.extents = extents_bulk; + laminate_bcast_forward(&bcast_tree, &in); + + /* free tree resources */ + unifyfs_tree_free(&bcast_tree); + + /* free bulk data handle */ + margo_bulk_free(extents_bulk); + } + + /* free extents array */ + free(extents); + + return ret; +} + + +/************************************************************************* + * Broadcast file truncation + *************************************************************************/ + +/* Forward the truncate broadcast to all children and wait for responses */ +static +int truncate_bcast_forward(const unifyfs_tree_t* broadcast_tree, + truncate_bcast_in_t* in) +{ + int i, rc, ret; + int gfid = (int) in->gfid; + size_t fsize = (size_t) in->filesize; + LOGDBG("MARGOTREE: truncate bcast forward - gfid=%d size=%zu", + gfid, fsize); + + /* apply truncation to local file state */ + ret = unifyfs_inode_truncate(gfid, (unsigned long)fsize); + if (ret) { + LOGERR("unifyfs_inode_truncate(gfid=%d, size=%zu) failed - ret=%d", + gfid, fsize, ret); + goto out; + } + + /* get info for tree */ + int child_count = broadcast_tree->child_count; + int* child_ranks = broadcast_tree->child_ranks; + if (child_count > 0) { + LOGDBG("MARGOTREE: sending truncate to %d children", + child_count); + + /* allocate memory for request objects + * TODO: possibly get this from memory pool */ + coll_request* requests = calloc(child_count, + sizeof(coll_request)); + if (!requests) { + ret = ENOMEM; + goto 
out; + } + + /* forward request down the tree */ + coll_request* req; + hg_id_t hgid = unifyfsd_rpc_context->rpcs.truncate_bcast_id; + for (i = 0; i < child_count; i++) { + req = requests + i; + + /* get rank of this child */ + int child = child_ranks[i]; + LOGDBG("MARGOTREE: truncate child[%d] is rank %d - %s", + i, child, glb_servers[child].margo_svr_addr_str); + + /* allocate handle */ + rc = get_request_handle(hgid, child, req); + if (rc == UNIFYFS_SUCCESS) { + /* invoke truncate request rpc on child */ + rc = forward_request((void*)in, req); + } else { + ret = rc; + } + } + + /* wait for the requests to finish */ + for (i = 0; i < child_count; i++) { + req = requests + i; + rc = wait_for_request(req); + if (rc == UNIFYFS_SUCCESS) { + /* get the output of the rpc */ + truncate_bcast_out_t out; + hg_return_t hret = margo_get_output(req->handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* set return value */ + int child_ret = out.ret; + LOGDBG("MARGOTREE: truncate child[%d] response: ret=%d", + i, child_ret); + if (child_ret != UNIFYFS_SUCCESS) { + ret = child_ret; + } + margo_free_output(req->handle, &out); + } + margo_destroy(req->handle); + } else { + ret = rc; + } + } + + free(requests); + } + +out: + return ret; +} + +/* truncate broadcast rpc handler */ +static void truncate_bcast_rpc(hg_handle_t handle) +{ + LOGDBG("MARGOTREE: truncate bcast handler"); + + /* assume we'll succeed */ + int32_t ret = UNIFYFS_SUCCESS; + + /* get input params */ + truncate_bcast_in_t in; + hg_return_t hret = margo_get_input(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* create communication tree */ + unifyfs_tree_t bcast_tree; + unifyfs_tree_init(glb_pmi_rank, glb_pmi_size, in.root, + UNIFYFS_BCAST_K_ARY, &bcast_tree); + + ret = truncate_bcast_forward(&bcast_tree, &in); + + unifyfs_tree_free(&bcast_tree); + margo_free_input(handle, &in); + } + + /* build our output values */ + truncate_bcast_out_t out; + out.ret = ret; + + /* send output back to caller */ + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* free margo resources */ + margo_destroy(handle); +} +DEFINE_MARGO_RPC_HANDLER(truncate_bcast_rpc) + +/* Execute broadcast tree for file truncate */ +int unifyfs_invoke_broadcast_truncate(int gfid, size_t filesize) +{ + LOGDBG("broadcasting truncate for gfid=%d filesize=%zu", + gfid, filesize); + + /* assuming success */ + int ret = UNIFYFS_SUCCESS; + + /* create communication tree */ + unifyfs_tree_t bcast_tree; + unifyfs_tree_init(glb_pmi_rank, glb_pmi_size, glb_pmi_rank, + UNIFYFS_BCAST_K_ARY, &bcast_tree); + + /* fill in input struct */ + truncate_bcast_in_t in; + in.root = (int32_t) glb_pmi_rank; + in.gfid = gfid; + in.filesize = filesize; + + ret = truncate_bcast_forward(&bcast_tree, &in); + if (ret) { + LOGERR("truncate_bcast_forward failed: (ret=%d)", ret); + } + + unifyfs_tree_free(&bcast_tree); + + return ret; +} + +/************************************************************************* + * Broadcast updates to file attributes + *************************************************************************/ + +/* Forward the fileattr broadcast to all children and wait for responses */ +static +int fileattr_bcast_forward(const unifyfs_tree_t* broadcast_tree, + fileattr_bcast_in_t* in) +{ + int i, rc, ret; + int gfid = (int) in->gfid; + + LOGDBG("MARGOTREE: fileattr bcast forward 
(gfid=%d)", gfid); + + /* set local metadata for target file */ + ret = unifyfs_inode_metaset(gfid, in->attrop, &in->attr); + if (ret) { + goto out; + } + + /* get info for tree */ + int child_count = broadcast_tree->child_count; + int* child_ranks = broadcast_tree->child_ranks; + if (child_count > 0) { + LOGDBG("MARGOTREE: %d: sending metaset to %d children", + glb_pmi_rank, child_count); + + /* allocate memory for request objects + * TODO: possibly get this from memory pool */ + coll_request* requests = calloc(child_count, + sizeof(coll_request)); + if (!requests) { + ret = ENOMEM; + goto out; + } + + /* forward request down the tree */ + coll_request* req; + hg_id_t hgid = unifyfsd_rpc_context->rpcs.fileattr_bcast_id; + for (i = 0; i < child_count; i++) { + req = requests + i; + + /* get rank of this child */ + int child = child_ranks[i]; + LOGDBG("MARGOTREE: metaset child[%d] is rank %d - %s", + i, child, glb_servers[child].margo_svr_addr_str); + + /* allocate handle */ + rc = get_request_handle(hgid, child, req); + if (rc == UNIFYFS_SUCCESS) { + /* invoke metaset request rpc on child */ + rc = forward_request((void*)in, req); + } else { + ret = rc; + } + } + + /* wait for the requests to finish */ + for (i = 0; i < child_count; i++) { + req = requests + i; + rc = wait_for_request(req); + if (rc == UNIFYFS_SUCCESS) { + /* get the output of the rpc */ + fileattr_bcast_out_t out; + hg_return_t hret = margo_get_output(req->handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* set return value */ + int child_ret = out.ret; + LOGDBG("MARGOTREE: metaset child[%d] response: ret=%d", + i, child_ret); + if (child_ret != UNIFYFS_SUCCESS) { + ret = child_ret; + } + margo_free_output(req->handle, &out); + } + margo_destroy(req->handle); + } else { + ret = rc; + } + } + + free(requests); + } +out: + return ret; +} + +/* file attributes broadcast rpc handler */ +static void fileattr_bcast_rpc(hg_handle_t handle) +{ + LOGDBG("MARGOTREE: fileattr bcast handler"); + + /* assume we'll succeed */ + int32_t ret = UNIFYFS_SUCCESS; + + /* get input params */ + fileattr_bcast_in_t in; + hg_return_t hret = margo_get_input(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* create communication tree */ + unifyfs_tree_t bcast_tree; + unifyfs_tree_init(glb_pmi_rank, glb_pmi_size, in.root, + UNIFYFS_BCAST_K_ARY, &bcast_tree); + + ret = fileattr_bcast_forward(&bcast_tree, &in); + + unifyfs_tree_free(&bcast_tree); + margo_free_input(handle, &in); + } + + /* build our output values */ + fileattr_bcast_out_t out; + out.ret = ret; + + /* send output back to caller */ + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* free margo resources */ + margo_destroy(handle); +} +DEFINE_MARGO_RPC_HANDLER(fileattr_bcast_rpc) + +/* Execute broadcast tree for file attributes update */ +int unifyfs_invoke_broadcast_fileattr(int gfid, + int attr_op, + unifyfs_file_attr_t* fattr) +{ + LOGDBG("broadcasting file attributes for gfid=%d", gfid); + + /* create communication tree */ + unifyfs_tree_t bcast_tree; + unifyfs_tree_init(glb_pmi_rank, glb_pmi_size, glb_pmi_rank, + UNIFYFS_BCAST_K_ARY, &bcast_tree); + + /* fill in input struct */ + fileattr_bcast_in_t in; + in.root = (int32_t) glb_pmi_rank; + in.gfid = gfid; + in.attrop = attr_op; + in.attr = *fattr; + + int ret = fileattr_bcast_forward(&bcast_tree, &in); + if (ret) { + 
LOGERR("fileattr_bcast_forward failed: (ret=%d)", ret); + } + + unifyfs_tree_free(&bcast_tree); + + return ret; +} + +/************************************************************************* + * Broadcast file unlink + *************************************************************************/ + +/* Forward the unlink broadcast to all children and wait for responses */ +static +int unlink_bcast_forward(const unifyfs_tree_t* broadcast_tree, + unlink_bcast_in_t* in) +{ + int i, rc, ret; + int gfid = (int) in->gfid; + + LOGDBG("MARGOTREE: unlink bcast forward (gfid=%d)", gfid); + + /* remove local file metadata */ + ret = unifyfs_inode_unlink(in->gfid); + if (ret) { + goto out; + } + + /* get info for tree */ + int child_count = broadcast_tree->child_count; + int* child_ranks = broadcast_tree->child_ranks; + if (child_count > 0) { + LOGDBG("MARGOTREE: %d: sending unlink to %d children", + glb_pmi_rank, child_count); + + /* allocate memory for request objects + * TODO: possibly get this from memory pool */ + coll_request* requests = calloc(child_count, + sizeof(coll_request)); + if (!requests) { + ret = ENOMEM; + goto out; + } + + /* forward request down the tree */ + coll_request* req; + hg_id_t hgid = unifyfsd_rpc_context->rpcs.unlink_bcast_id; + for (i = 0; i < child_count; i++) { + req = requests + i; + + /* get rank of this child */ + int child = child_ranks[i]; + LOGDBG("MARGOTREE: unlink child[%d] is rank %d - %s", + i, child, glb_servers[child].margo_svr_addr_str); + + /* allocate handle */ + rc = get_request_handle(hgid, child, req); + if (rc == UNIFYFS_SUCCESS) { + /* invoke unlink request rpc on child */ + rc = forward_request((void*)in, req); + } else { + ret = rc; + } + } + + /* wait for the requests to finish */ + for (i = 0; i < child_count; i++) { + req = requests + i; + rc = wait_for_request(req); + if (rc == UNIFYFS_SUCCESS) { + /* get the output of the rpc */ + unlink_bcast_out_t out; + hg_return_t hret = margo_get_output(req->handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* set return value */ + int child_ret = out.ret; + LOGDBG("MARGOTREE: unlink child[%d] response: ret=%d", + i, child_ret); + if (child_ret != UNIFYFS_SUCCESS) { + ret = child_ret; + } + margo_free_output(req->handle, &out); + } + margo_destroy(req->handle); + } else { + ret = rc; + } + } + + free(requests); + } + +out: + return ret; +} + +/* unlink broacast rpc handler */ +static void unlink_bcast_rpc(hg_handle_t handle) +{ + LOGDBG("MARGOTREE: unlink bcast handler"); + + int32_t ret; + + /* get input params */ + unlink_bcast_in_t in; + hg_return_t hret = margo_get_input(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* create communication tree */ + unifyfs_tree_t bcast_tree; + unifyfs_tree_init(glb_pmi_rank, glb_pmi_size, in.root, + UNIFYFS_BCAST_K_ARY, &bcast_tree); + + ret = unlink_bcast_forward(&bcast_tree, &in); + + unifyfs_tree_free(&bcast_tree); + margo_free_input(handle, &in); + } + + /* build our output values */ + unlink_bcast_out_t out; + out.ret = ret; + + /* send output back to caller */ + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* free margo resources */ + margo_destroy(handle); +} +DEFINE_MARGO_RPC_HANDLER(unlink_bcast_rpc) + +/* Execute broadcast tree for file unlink */ +int unifyfs_invoke_broadcast_unlink(int gfid) +{ + LOGDBG("broadcasting unlink for gfid=%d", gfid); + 
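+    /* Same collective pattern as the other broadcasts in this file: this
+     * server acts as the root of a k-ary tree (UNIFYFS_BCAST_K_ARY fanout),
+     * unlink_bcast_forward() removes the local inode and forwards the
+     * request to each child, and each child's status is folded into the
+     * return code. rpc_unlink() in the fops layer is the expected caller. */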
+ /* create communication tree */ + unifyfs_tree_t bcast_tree; + unifyfs_tree_init(glb_pmi_rank, glb_pmi_size, glb_pmi_rank, + UNIFYFS_BCAST_K_ARY, &bcast_tree); + + /* fill in input struct */ + unlink_bcast_in_t in; + in.root = (int32_t) glb_pmi_rank; + in.gfid = (int32_t) gfid; + + int ret = unlink_bcast_forward(&bcast_tree, &in); + if (ret) { + LOGERR("unlink_bcast_forward failed: (ret=%d)", ret); + } + + unifyfs_tree_free(&bcast_tree); + + return ret; +} diff --git a/server/src/unifyfs_group_rpc.h b/server/src/unifyfs_group_rpc.h new file mode 100644 index 000000000..80db9f0d9 --- /dev/null +++ b/server/src/unifyfs_group_rpc.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#ifndef _UNIFYFS_GROUP_RPC_H +#define _UNIFYFS_GROUP_RPC_H + +#include "unifyfs_tree.h" +#include "unifyfs_inode.h" + +/* Collective Server RPCs */ + +/** + * @brief Broadcast file extents metadata to all servers + * + * @param gfid target file + * @param len length of file extents array + * @param extents array of extents to broadcast + * + * @return success|failure + */ +int unifyfs_invoke_broadcast_extents(int gfid, + unsigned int len, + struct extent_tree_node* extents); + +/** + * @brief Broadcast file attributes metadata to all servers + * + * @param gfid target file + * @param fileop file operation that triggered metadata update + * @param attr file attributes + * + * @return success|failure + */ +int unifyfs_invoke_broadcast_fileattr(int gfid, + int fileop, + unifyfs_file_attr_t* attr); + +/** + * @brief Broadcast file attributes and extent metadata to all servers + * + * @param gfid target file + * + * @return success|failure + */ +int unifyfs_invoke_broadcast_laminate(int gfid); + +/** + * @brief Truncate target file at all servers + * + * @param gfid target file + * @param filesize truncated file size + * + * @return success|failure + */ +int unifyfs_invoke_broadcast_truncate(int gfid, size_t filesize); + +/** + * @brief Unlink file at all servers + * + * @param gfid target file + * + * @return success|failure + */ +int unifyfs_invoke_broadcast_unlink(int gfid); + + +#endif // UNIFYFS_GROUP_RPC_H diff --git a/server/src/unifyfs_init.c b/server/src/unifyfs_init.c deleted file mode 100644 index 4781d341f..000000000 --- a/server/src/unifyfs_init.c +++ /dev/null @@ -1,676 +0,0 @@ -/* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * - * Copyright 2017, UT-Battelle, LLC. - * - * LLNL-CODE-741539 - * All rights reserved. - * - * This is the license for UnifyFS. - * For details, see https://github.com/LLNL/UnifyFS. - * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. - */ - -/* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * Copyright (c) 2017, Florida State University. Contributions from - * the Computer Architecture and Systems Research Laboratory (CASTL) - * at the Department of Computer Science. - * - * Written by: Teng Wang, Adam Moody, Weikuan Yu, Kento Sato, Kathryn Mohror - * LLNL-CODE-728877. All rights reserved. - * - * This file is part of burstfs. 
- * For details, see https://github.com/llnl/burstfs - * Please read https://github.com/llnl/burstfs/LICENSE for full license text. - */ - -// system headers -#include -#include -#include - -// common headers -#include "unifyfs_configurator.h" -#include "unifyfs_keyval.h" -#include "unifyfs_runstate.h" - -// server components -#include "unifyfs_global.h" -#include "unifyfs_metadata.h" -#include "unifyfs_request_manager.h" -#include "unifyfs_service_manager.h" - -// margo rpcs -#include "margo_server.h" - -int glb_mpi_rank, glb_mpi_size; -char glb_host[UNIFYFS_MAX_HOSTNAME]; - -size_t glb_num_servers; // size of glb_servers array -server_info_t* glb_servers; // array of server_info_t - -arraylist_t* app_config_list; - -int invert_sock_ids[MAX_NUM_CLIENTS]; /*records app_id for each sock_id*/ - -unifyfs_cfg_t server_cfg; - -static int unifyfs_exit(void); - -#if defined(UNIFYFS_MULTIPLE_DELEGATORS) -int* local_rank_lst; -int local_rank_cnt; - -/* - * structure that records the information of - * each application launched by srun. - * */ -typedef struct { - char hostname[UNIFYFS_MAX_HOSTNAME]; - int rank; -} name_rank_pair_t; - -static int compare_name_rank_pair(const void* a, const void* b); -static int compare_int(const void* a, const void* b); -static int CountTasksPerNode(int rank, int numTasks); -static int find_rank_idx(int my_rank); -#endif - -/* - * Perform steps to create a daemon process: - * - * 1. Fork and exit from parent so child runs in the background - * 2. Set the daemon umask to 0 so file modes passed to open() and - * mkdir() fully control access modes - * 3. Call setsid() to create a new session and detach from controlling tty - * 4. Change current working directory to / so daemon doesn't block - * filesystem unmounts - * 5. close STDIN, STDOUT, and STDERR - * 6. 
Fork again to abdicate session leader position to guarantee - * daemon cannot reacquire a controlling TTY - * - */ -static void daemonize(void) -{ - pid_t pid; - pid_t sid; - int rc; - - pid = fork(); - - if (pid < 0) { - LOGERR("fork failed: %s", strerror(errno)); - exit(1); - } - - if (pid > 0) { - exit(0); - } - - umask(0); - - sid = setsid(); - if (sid < 0) { - LOGERR("setsid failed: %s", strerror(errno)); - exit(1); - } - - rc = chdir("/"); - if (rc < 0) { - LOGERR("chdir failed: %s", strerror(errno)); - exit(1); - } - - close(STDIN_FILENO); - close(STDOUT_FILENO); - close(STDERR_FILENO); - - pid = fork(); - if (pid < 0) { - LOGERR("fork failed: %s", strerror(errno)); - exit(1); - } else if (pid > 0) { - exit(0); - } -} - -static int time_to_exit; -void exit_request(int sig) -{ -#ifdef HAVE_STRSIGNAL - const char* sigstr = strsignal(sig); - LOGDBG("got signal %s", sigstr); -#endif - - switch (sig) { - case SIGINT: - case SIGQUIT: - case SIGTERM: - time_to_exit = 1; - LOGDBG("exit requested"); - break; - default: - LOGERR("unhandled signal %d", sig); - break; - } -} - -static int allocate_servers(size_t n_servers) -{ - glb_num_servers = n_servers; - glb_servers = (server_info_t*) calloc(n_servers, sizeof(server_info_t)); - if (NULL == glb_servers) { - LOGERR("failed to allocate server_info array"); - return (int)UNIFYFS_ERROR_NOMEM; - } - return (int)UNIFYFS_SUCCESS; -} - -static int process_servers_hostfile(const char* hostfile) -{ - int rc; - size_t i, cnt; - FILE* fp = NULL; - char hostbuf[UNIFYFS_MAX_HOSTNAME+1]; - - if (NULL == hostfile) { - return (int)UNIFYFS_ERROR_INVAL; - } - fp = fopen(hostfile, "r"); - if (!fp) { - LOGERR("failed to open hostfile %s", hostfile); - return (int)UNIFYFS_FAILURE; - } - - // scan first line: number of hosts - rc = fscanf(fp, "%zu\n", &cnt); - if (1 != rc) { - LOGERR("failed to scan hostfile host count"); - fclose(fp); - return (int)UNIFYFS_FAILURE; - } - rc = allocate_servers(cnt); - if ((int)UNIFYFS_SUCCESS != rc) { - fclose(fp); - return (int)UNIFYFS_FAILURE; - } - - // scan host lines - for (i = 0; i < cnt; i++) { - memset(hostbuf, 0, sizeof(hostbuf)); - rc = fscanf(fp, "%s\n", hostbuf); - if (1 != rc) { - LOGERR("failed to scan hostfile host line %zu", i); - fclose(fp); - return (int)UNIFYFS_FAILURE; - } - //glb_servers[i].hostname = strdup(hostbuf); - // NOTE: following assumes one server per host - if (0 == strcmp(glb_host, hostbuf)) { - //glb_svr_rank = (int)i; - LOGDBG("found myself at hostfile index=%zu, mpi_rank=%d", - i, glb_mpi_rank); - } - } - fclose(fp); - - return (int)UNIFYFS_SUCCESS; -} - -int main(int argc, char* argv[]) -{ - int provided; - int rc; - int kv_rank, kv_nranks; - bool daemon = true; - struct sigaction sa; - char rank_str[16] = {0}; - char dbg_fname[UNIFYFS_MAX_FILENAME] = {0}; - - rc = unifyfs_config_init(&server_cfg, argc, argv); - if (rc != 0) { - exit(1); - } - server_cfg.ptype = UNIFYFS_SERVER; - - rc = configurator_bool_val(server_cfg.unifyfs_daemonize, &daemon); - if (rc != 0) { - exit(1); - } - if (daemon) { - daemonize(); - } - - // setup clean termination by signal - memset(&sa, 0, sizeof(struct sigaction)); - sa.sa_handler = exit_request; - rc = sigemptyset(&sa.sa_mask); - rc = sigaction(SIGINT, &sa, NULL); - rc = sigaction(SIGQUIT, &sa, NULL); - rc = sigaction(SIGTERM, &sa, NULL); - - app_config_list = arraylist_create(); - if (app_config_list == NULL) { - LOGERR("%s", unifyfs_error_enum_description(UNIFYFS_ERROR_NOMEM)); - exit(1); - } - - rm_thrd_list = arraylist_create(); - if (rm_thrd_list == NULL) { 
- LOGERR("%s", unifyfs_error_enum_description(UNIFYFS_ERROR_NOMEM)); - exit(1); - } - - rc = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided); - if (rc != MPI_SUCCESS) { - exit(1); - } - - rc = MPI_Comm_rank(MPI_COMM_WORLD, &glb_mpi_rank); - if (rc != MPI_SUCCESS) { - exit(1); - } - - rc = MPI_Comm_size(MPI_COMM_WORLD, &glb_mpi_size); - if (rc != MPI_SUCCESS) { - exit(1); - } - - // start logging - gethostname(glb_host, sizeof(glb_host)); - snprintf(dbg_fname, sizeof(dbg_fname), "%s/%s.%s.%d", - server_cfg.log_dir, server_cfg.log_file, glb_host, glb_mpi_rank); - rc = unifyfs_log_open(dbg_fname); - if (rc != UNIFYFS_SUCCESS) { - LOGERR("%s", unifyfs_error_enum_description((unifyfs_error_e)rc)); - } - - if (NULL != server_cfg.server_hostfile) { - rc = process_servers_hostfile(server_cfg.server_hostfile); - if (rc != (int)UNIFYFS_SUCCESS) { - LOGERR("failed to gather server information"); - exit(1); - } - } - - kv_rank = glb_mpi_rank; - kv_nranks = glb_mpi_size; - rc = unifyfs_keyval_init(&server_cfg, &kv_rank, &kv_nranks); - if (rc != (int)UNIFYFS_SUCCESS) { - exit(1); - } - if (glb_mpi_rank != kv_rank) { - LOGDBG("mismatch on MPI (%d) vs kvstore (%d) rank", - glb_mpi_rank, kv_rank); - } - if (glb_mpi_size != kv_nranks) { - LOGDBG("mismatch on MPI (%d) vs kvstore (%d) num ranks", - glb_mpi_size, kv_nranks); - } - - // TEMPORARY: remove once we fully eliminate use of MPI in sever - snprintf(rank_str, sizeof(rank_str), "%d", glb_mpi_rank); - rc = unifyfs_keyval_publish_remote(key_unifyfsd_mpi_rank, rank_str); - if (rc != (int)UNIFYFS_SUCCESS) { - exit(1); - } - - if (NULL == server_cfg.server_hostfile) { - //glb_svr_rank = kv_rank; - rc = allocate_servers((size_t)kv_nranks); - } - - rc = unifyfs_write_runstate(&server_cfg); - if (rc != (int)UNIFYFS_SUCCESS) { - exit(1); - } - - LOGDBG("initializing rpc service"); - rc = configurator_bool_val(server_cfg.margo_tcp, &margo_use_tcp); - rc = margo_server_rpc_init(); - if (rc != UNIFYFS_SUCCESS) { - LOGERR("%s", unifyfs_error_enum_description(UNIFYFS_ERROR_MARGO)); - exit(1); - } - - MPI_Barrier(MPI_COMM_WORLD); - - LOGDBG("connecting rpc servers"); - rc = margo_connect_servers(); - if (rc != UNIFYFS_SUCCESS) { - LOGERR("%s", unifyfs_error_enum_description(UNIFYFS_ERROR_MARGO)); - exit(1); - } - -#if defined(UNIFYFS_USE_DOMAIN_SOCKET) - int srvr_rank_idx = 0; -#if defined(UNIFYFS_MULTIPLE_DELEGATORS) - rc = CountTasksPerNode(glb_mpi_rank, glb_mpi_size); - if (rc < 0) { - exit(1); - } - srvr_rank_idx = find_rank_idx(glb_mpi_rank); -#endif // UNIFYFS_MULTIPLE_DELEGATORS - LOGDBG("creating server domain socket"); - rc = sock_init_server(srvr_rank_idx); - if (rc != 0) { - LOGERR("%s", unifyfs_error_enum_description(UNIFYFS_ERROR_SOCKET)); - exit(1); - } -#endif // UNIFYFS_USE_DOMAIN_SOCKET - - /* launch the service manager */ - LOGDBG("launching service manager thread"); - rc = svcmgr_init(); - if (rc != (int)UNIFYFS_SUCCESS) { - LOGERR("launch failed - %s", unifyfs_error_enum_description(rc)); - exit(1); - } - - LOGDBG("initializing metadata store"); - rc = meta_init_store(&server_cfg); - if (rc != 0) { - LOGERR("%s", unifyfs_error_enum_description(UNIFYFS_ERROR_MDINIT)); - exit(1); - } - - MPI_Barrier(MPI_COMM_WORLD); - LOGDBG("finished service initialization"); - - while (1) { -#if defined(UNIFYFS_USE_DOMAIN_SOCKET) - int timeout_ms = 2000; /* in milliseconds */ - rc = sock_wait_cmd(timeout_ms); - if (rc != UNIFYFS_SUCCESS) { - // we ignore disconnects, they are expected - if (rc != UNIFYFS_ERROR_SOCK_DISCONNECT) { - LOGDBG("domain 
socket error %s", - unifyfs_error_enum_description((unifyfs_error_e)rc)); - time_to_exit = 1; - } - } -#else - sleep(1); -#endif // UNIFYFS_USE_DOMAIN_SOCKET - if (time_to_exit) { - LOGDBG("starting service shutdown"); - break; - } - } - - LOGDBG("stopping service manager thread"); - rc = svcmgr_fini(); - - LOGDBG("cleaning run state"); - rc = unifyfs_clean_runstate(&server_cfg); - - return unifyfs_exit(); -} - -#if defined(UNIFYFS_MULTIPLE_DELEGATORS) -/* count the number of delegators per node, and - * the rank of each delegator, the results are stored - * in local_rank_cnt and local_rank_lst. - * @param numTasks: number of processes in the communicator - * @return success/error code */ -static int CountTasksPerNode(int rank, int numTasks) -{ - char localhost[UNIFYFS_MAX_HOSTNAME]; - char hostname[UNIFYFS_MAX_HOSTNAME]; - int resultsLen = UNIFYFS_MAX_HOSTNAME; - - MPI_Status status; - int i, j, rc; - - if (numTasks < 0) { - return -1; - } - - rc = MPI_Get_processor_name(localhost, &resultsLen); - if (rc != 0) { - return -1; - } - - if (rank == 0) { - /* a container of (rank, host) mappings */ - name_rank_pair_t* host_set = - (name_rank_pair_t*)calloc(numTasks, sizeof(name_rank_pair_t)); - /* MPI_Recv all hostnames, and compare to local hostname */ - for (i = 1; i < numTasks; i++) { - rc = MPI_Recv(hostname, UNIFYFS_MAX_HOSTNAME, - MPI_CHAR, MPI_ANY_SOURCE, - MPI_ANY_TAG, - MPI_COMM_WORLD, &status); - if (rc != 0) { - return -1; - } - strcpy(host_set[i].hostname, hostname); - host_set[i].rank = status.MPI_SOURCE; - } - strcpy(host_set[0].hostname, localhost); - host_set[0].rank = 0; - - /* sort by hostname */ - qsort(host_set, numTasks, sizeof(name_rank_pair_t), - compare_name_rank_pair); - - /* rank_cnt: records the number of processes on each host - * rank_set: the list of ranks for each host */ - int** rank_set = (int**)calloc(numTasks, sizeof(int*)); - int* rank_cnt = (int*)calloc(numTasks, sizeof(int)); - - int cursor = 0; - int set_counter = 0; - for (i = 1; i < numTasks; i++) { - if (strcmp(host_set[i].hostname, - host_set[i - 1].hostname) != 0) { - // found a different host, so switch to a new set - int hiter, riter = 0; - rank_set[set_counter] = - (int*)calloc((i - cursor), sizeof(int)); - rank_cnt[set_counter] = i - cursor; - for (hiter = cursor; hiter < i; hiter++, riter++) { - rank_set[set_counter][riter] = host_set[hiter].rank; - } - - set_counter++; - cursor = i; - } - } - - /* fill rank_cnt and rank_set entry for the last host */ - - rank_set[set_counter] = - (int*)calloc((i - cursor), sizeof(int)); - rank_cnt[set_counter] = numTasks - cursor; - j = 0; - for (i = cursor; i < numTasks; i++, j++) { - rank_set[set_counter][j] = host_set[i].rank; - } - set_counter++; - - /* broadcast rank_set information */ - int root_set_no = -1; - for (i = 0; i < set_counter; i++) { - /* send rank set to each of its ranks */ - for (j = 0; j < rank_cnt[i]; j++) { - if (rank_set[i][j] != 0) { - rc = MPI_Send(&rank_cnt[i], 1, MPI_INT, - rank_set[i][j], 0, MPI_COMM_WORLD); - if (rc != 0) { - return -1; - } - rc = MPI_Send(rank_set[i], rank_cnt[i], MPI_INT, - rank_set[i][j], 0, MPI_COMM_WORLD); - if (rc != 0) { - return -1; - } - } else { - root_set_no = i; - local_rank_cnt = rank_cnt[i]; - local_rank_lst = (int*)calloc(rank_cnt[i], sizeof(int)); - memcpy(local_rank_lst, rank_set[i], - (local_rank_cnt * sizeof(int))) - } - } - } - - for (i = 0; i < set_counter; i++) { - free(rank_set[i]); - } - free(rank_cnt); - free(host_set); - free(rank_set); - } else { /* non-root rank */ - /* MPI_Send 
hostname to root */ - rc = MPI_Send(localhost, UNIFYFS_MAX_HOSTNAME, MPI_CHAR, - 0, 0, MPI_COMM_WORLD); - if (rc != 0) { - return -1; - } - /* receive the local rank set count */ - rc = MPI_Recv(&local_rank_cnt, 1, MPI_INT, 0, - 0, MPI_COMM_WORLD, &status); - if (rc != 0) { - return -1; - } - /* receive the the local rank set */ - local_rank_lst = (int*)calloc(local_rank_cnt, sizeof(int)); - rc = MPI_Recv(local_rank_lst, local_rank_cnt, MPI_INT, 0, - 0, MPI_COMM_WORLD, &status); - if (rc != 0) { - free(local_rank_lst); - return -1; - } - } - - /* sort by rank */ - qsort(local_rank_lst, local_rank_cnt, sizeof(int), compare_int); - - return 0; -} - -static int find_rank_idx(int my_rank) -{ - int i; - assert(local_rank_lst != NULL); - for (i = 0; i < local_rank_cnt; i++) { - if (local_rank_lst[i] == my_rank) { - return i; - } - } - return -1; -} - -static int compare_name_rank_pair(const void* a, const void* b) -{ - const name_rank_pair_t* pair_a = a; - const name_rank_pair_t* pair_b = b; - return strcmp(pair_a->hostname, pair_b->hostname); -} - -static int compare_int(const void* a, const void* b) -{ - int aval = *(const int*)a; - int bval = *(const int*)b; - return aval - bval; -} - -#endif // UNIFYFS_MULTIPLE_DELEGATORS - - -static int unifyfs_exit(void) -{ - int rc = UNIFYFS_SUCCESS; - - /* shutdown rpc service */ - LOGDBG("stopping rpc service"); - margo_server_rpc_finalize(); - -#if defined(UNIFYFS_USE_DOMAIN_SOCKET) - /* close remaining sockets */ - LOGDBG("closing sockets"); - sock_sanitize(); -#endif - - /* finalize kvstore service*/ - LOGDBG("finalizing kvstore service"); - unifyfs_keyval_fini(); - - /* TODO: notify the service threads to exit */ - - /* notify the request manager threads to exit*/ - LOGDBG("stopping request manager threads"); - int i, j; - for (i = 0; i < arraylist_size(rm_thrd_list); i++) { - /* request and wait for request manager thread exit */ - reqmgr_thrd_t* thrd_ctrl = - (reqmgr_thrd_t*) arraylist_get(rm_thrd_list, i); - rm_cmd_exit(thrd_ctrl); - } - arraylist_free(rm_thrd_list); - - /* sanitize the shared memory and delete the log files - * */ - int app_sz = arraylist_size(app_config_list); - - /* iterate over each active application and free resources */ - for (i = 0; i < app_sz; i++) { - /* get pointer to app config for this app_id */ - app_config_t* app = - (app_config_t*)arraylist_get(app_config_list, i); - - /* skip to next app_id if this is empty */ - if (app == NULL) { - continue; - } - - /* free resources allocate for each client */ - for (j = 0; j < MAX_NUM_CLIENTS; j++) { - /* release request buffer shared memory region */ - if (app->shm_req_bufs[j] != NULL) { - unifyfs_shm_free(app->req_buf_name[j], app->req_buf_sz, - (void**)&(app->shm_req_bufs[j])); - } - - /* release receive buffer shared memory region */ - if (app->shm_recv_bufs[j] != NULL) { - unifyfs_shm_free(app->recv_buf_name[j], app->recv_buf_sz, - (void**)&(app->shm_recv_bufs[j])); - } - - /* release super block shared memory region */ - if (app->shm_superblocks[j] != NULL) { - unifyfs_shm_free(app->super_buf_name[j], app->superblock_sz, - (void**)&(app->shm_superblocks[j])); - } - - /* close spill log file and delete it */ - if (app->spill_log_fds[j] > 0) { - close(app->spill_log_fds[j]); - unlink(app->spill_log_name[j]); - } - - /* close spill log index file and delete it */ - if (app->spill_index_log_fds[j] > 0) { - close(app->spill_index_log_fds[j]); - unlink(app->spill_index_log_name[j]); - } - } - } - - /* shutdown the metadata service*/ - LOGDBG("stopping metadata service"); - 
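
The removed CountTasksPerNode() gathers every hostname at rank 0, sorts the (hostname, rank) pairs, and redistributes a per-node rank list. For comparison only -- this sketch is not part of the patch -- MPI-3's MPI_Comm_split_type() produces the same local_rank_cnt/local_rank_lst grouping with far less bookkeeping:

    #include <stdlib.h>
    #include <mpi.h>

    /* stand-ins for the server globals of the same names */
    static int* local_rank_lst;
    static int  local_rank_cnt;

    static int count_tasks_per_node(int world_rank)
    {
        MPI_Comm node_comm;

        /* group the ranks that share a node (shared-memory domain);
         * using world_rank as the key keeps them in world-rank order */
        MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED,
                            world_rank, MPI_INFO_NULL, &node_comm);
        MPI_Comm_size(node_comm, &local_rank_cnt);

        /* collect the world ranks of all node-local processes; the
         * result is already sorted, so no qsort() pass is needed */
        local_rank_lst = calloc(local_rank_cnt, sizeof(int));
        if (NULL == local_rank_lst) {
            return -1;
        }
        MPI_Allgather(&world_rank, 1, MPI_INT,
                      local_rank_lst, 1, MPI_INT, node_comm);

        MPI_Comm_free(&node_comm);
        return 0;
    }
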
meta_sanitize(); - - LOGDBG("finalizing MPI"); - MPI_Finalize(); - - LOGDBG("all done!"); - unifyfs_log_close(); - - return rc; -} diff --git a/server/src/unifyfs_inode.c b/server/src/unifyfs_inode.c new file mode 100644 index 000000000..dc1e58230 --- /dev/null +++ b/server/src/unifyfs_inode.c @@ -0,0 +1,664 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#include +#include +#include +#include +#include + +#include "unifyfs_inode.h" +#include "unifyfs_inode_tree.h" + +struct unifyfs_inode_tree _global_inode_tree; +struct unifyfs_inode_tree* global_inode_tree = &_global_inode_tree; + +static inline +struct unifyfs_inode* unifyfs_inode_alloc(int gfid, unifyfs_file_attr_t* attr) +{ + struct unifyfs_inode* ino = calloc(1, sizeof(*ino)); + + if (ino) { + ino->gfid = gfid; + ino->attr = *attr; + ino->attr.filename = strdup(attr->filename); + pthread_rwlock_init(&ino->rwlock, NULL); + ABT_mutex_create(&(ino->abt_sync)); + } + + return ino; +} + +static inline +int unifyfs_inode_destroy(struct unifyfs_inode* ino) +{ + int ret = UNIFYFS_SUCCESS; + + if (ino) { + if (NULL != ino->attr.filename) { + free(ino->attr.filename); + } + + if (NULL != ino->extents) { + extent_tree_destroy(ino->extents); + free(ino->extents); + } + + pthread_rwlock_destroy(&ino->rwlock); + free(ino); + } else { + ret = EINVAL; + } + + return ret; +} + +/** + * @brief read lock the inode for ro access. + * + * @param ino inode structure to get access + * + * @return 0 on success, errno otherwise + */ +static inline +int unifyfs_inode_rdlock(struct unifyfs_inode* ino) +{ + return pthread_rwlock_rdlock(&ino->rwlock); +} + +/** + * @brief write lock the inode for w+r access. + * + * @param ino inode structure to get access + * + * @return 0 on success, errno otherwise + */ +static inline +int unifyfs_inode_wrlock(struct unifyfs_inode* ino) +{ + return pthread_rwlock_wrlock(&ino->rwlock); +} + +/** + * @brief unlock the inode. 
+ * + * @param ino inode structure to unlock + */ +static inline +void unifyfs_inode_unlock(struct unifyfs_inode* ino) +{ + pthread_rwlock_unlock(&ino->rwlock); +} + +int unifyfs_inode_create(int gfid, unifyfs_file_attr_t* attr) +{ + int ret = UNIFYFS_SUCCESS; + struct unifyfs_inode* ino = NULL; + + if (!attr) { + return EINVAL; + } + + ino = unifyfs_inode_alloc(gfid, attr); + + unifyfs_inode_tree_wrlock(global_inode_tree); + { + ret = unifyfs_inode_tree_insert(global_inode_tree, ino); + } + unifyfs_inode_tree_unlock(global_inode_tree); + + if (ret) { + free(ino); + } + + return ret; +} + +int unifyfs_inode_update_attr(int gfid, int attr_op, + unifyfs_file_attr_t* attr) +{ + int ret = UNIFYFS_SUCCESS; + struct unifyfs_inode* ino = NULL; + + if (!attr) { + return EINVAL; + } + + unifyfs_inode_tree_rdlock(global_inode_tree); + { + ino = unifyfs_inode_tree_search(global_inode_tree, gfid); + if (!ino) { + ret = ENOENT; + goto out_unlock_tree; + } + + unifyfs_inode_wrlock(ino); + { + unifyfs_file_attr_update(attr_op, &ino->attr, attr); + } + unifyfs_inode_unlock(ino); + } +out_unlock_tree: + unifyfs_inode_tree_unlock(global_inode_tree); + + return ret; +} + +int unifyfs_inode_metaset(int gfid, int attr_op, + unifyfs_file_attr_t* attr) +{ + int ret; + + if (attr_op == UNIFYFS_FILE_ATTR_OP_CREATE) { + ret = unifyfs_inode_create(gfid, attr); + } else { + ret = unifyfs_inode_update_attr(gfid, attr_op, attr); + } + + return ret; +} + +int unifyfs_inode_metaget(int gfid, unifyfs_file_attr_t* attr) +{ + int ret = UNIFYFS_SUCCESS; + struct unifyfs_inode* ino = NULL; + + if (!global_inode_tree || !attr) { + return EINVAL; + } + + unifyfs_inode_tree_rdlock(global_inode_tree); + { + ino = unifyfs_inode_tree_search(global_inode_tree, gfid); + if (ino) { + *attr = ino->attr; + } else { + ret = ENOENT; + } + } + unifyfs_inode_tree_unlock(global_inode_tree); + + return ret; +} + +int unifyfs_inode_unlink(int gfid) +{ + int ret = UNIFYFS_SUCCESS; + struct unifyfs_inode* ino = NULL; + + unifyfs_inode_tree_wrlock(global_inode_tree); + { + ret = unifyfs_inode_tree_remove(global_inode_tree, gfid, &ino); + } + unifyfs_inode_tree_unlock(global_inode_tree); + + if (ret) { + goto out; + } + + ret = unifyfs_inode_destroy(ino); +out: + return ret; +} + +int unifyfs_inode_truncate(int gfid, unsigned long size) +{ + int ret = UNIFYFS_SUCCESS; + struct unifyfs_inode* ino = NULL; + + unifyfs_inode_tree_rdlock(global_inode_tree); + { + ino = unifyfs_inode_tree_search(global_inode_tree, gfid); + if (!ino) { + ret = ENOENT; + goto out_unlock_tree; + } + + unifyfs_inode_rdlock(ino); + { + if (ino->attr.is_laminated) { + LOGERR("cannot truncate a laminated file (gfid=%d)", gfid); + ret = EINVAL; + goto unlock_inode; + } + ino->attr.size = size; + + if (NULL != ino->extents) { + ret = extent_tree_truncate(ino->extents, size); + } + } +unlock_inode: + unifyfs_inode_unlock(ino); + } +out_unlock_tree: + unifyfs_inode_tree_unlock(global_inode_tree); + + return ret; +} + +static struct extent_tree* inode_get_extent_tree(struct unifyfs_inode* ino) +{ + struct extent_tree* tree = ino->extents; + + /* create one if it doesn't exist yet */ + if (!tree) { + tree = calloc(1, sizeof(*tree)); + + if (!tree) { + LOGERR("failed to allocate memory for extent tree"); + return NULL; + } + + extent_tree_init(tree); + + ino->extents = tree; + } + + return tree; +} + +int unifyfs_inode_add_extents(int gfid, int num_extents, + struct extent_tree_node* nodes) +{ + int ret = UNIFYFS_SUCCESS; + int i = 0; + struct unifyfs_inode* ino = NULL; + struct 
extent_tree* tree = NULL; + + unifyfs_inode_tree_rdlock(global_inode_tree); + { + ino = unifyfs_inode_tree_search(global_inode_tree, gfid); + if (!ino) { + ret = ENOENT; + goto out_unlock_tree; + } + + if (ino->attr.is_laminated) { + LOGERR("trying to add extents to a laminated file (gfid=%d)", + gfid); + ret = EINVAL; + goto out_unlock_tree; + } + + tree = inode_get_extent_tree(ino); + if (!tree) { /* failed to create one */ + ret = ENOMEM; + goto out_unlock_tree; + } + + for (i = 0; i < num_extents; i++) { + struct extent_tree_node* current = &nodes[i]; + + /* the output becomes too noisy with this: + * LOGDBG("new extent[%4d]: (%lu, %lu)", + * i, current->start, current->end); + */ + + ABT_mutex_lock(ino->abt_sync); + ret = extent_tree_add(tree, current->start, current->end, + current->svr_rank, current->app_id, + current->cli_id, current->pos); + ABT_mutex_unlock(ino->abt_sync); + if (ret) { + LOGERR("failed to add extents"); + goto out_unlock_tree; + } + } + + /* if the extent tree max offset is greater than the size we + * we currently have in the inode attributes, then update the + * inode size */ + unsigned long extent_sz = extent_tree_max_offset(ino->extents) + 1; + if ((uint64_t)extent_sz > ino->attr.size) { + unifyfs_inode_wrlock(ino); + ino->attr.size = extent_sz; + unifyfs_inode_unlock(ino); + } + + LOGINFO("added %d extents to inode (gfid=%d, filesize=%" PRIu64 ")", + num_extents, gfid, ino->attr.size); + } +out_unlock_tree: + unifyfs_inode_tree_unlock(global_inode_tree); + + return ret; +} + +int unifyfs_inode_get_filesize(int gfid, size_t* offset) +{ + int ret = UNIFYFS_SUCCESS; + size_t filesize = 0; + struct unifyfs_inode* ino = NULL; + + unifyfs_inode_tree_rdlock(global_inode_tree); + { + ino = unifyfs_inode_tree_search(global_inode_tree, gfid); + if (!ino) { + ret = ENOENT; + goto out_unlock_tree; + } + + unifyfs_inode_rdlock(ino); + { + /* the size is updated each time we add extents or truncate, + * so no need to recalculate */ + filesize = ino->attr.size; + } + unifyfs_inode_unlock(ino); + + *offset = filesize; + + LOGDBG("local file size (gfid=%d): %lu", gfid, filesize); + } +out_unlock_tree: + unifyfs_inode_tree_unlock(global_inode_tree); + + return ret; +} + +int unifyfs_inode_laminate(int gfid) +{ + int ret = UNIFYFS_SUCCESS; + struct unifyfs_inode* ino = NULL; + + unifyfs_inode_tree_rdlock(global_inode_tree); + { + ino = unifyfs_inode_tree_search(global_inode_tree, gfid); + if (!ino) { + ret = ENOENT; + goto out_unlock_tree; + } + + unifyfs_inode_wrlock(ino); + { + ino->attr.is_laminated = 1; + } + unifyfs_inode_unlock(ino); + + LOGDBG("file laminated (gfid=%d)", gfid); + } +out_unlock_tree: + unifyfs_inode_tree_unlock(global_inode_tree); + + return ret; +} + +int unifyfs_inode_get_extents(int gfid, size_t* n, + struct extent_tree_node** nodes) +{ + int ret = UNIFYFS_SUCCESS; + struct unifyfs_inode* ino = NULL; + + if (!n || !nodes) { + return EINVAL; + } + + unifyfs_inode_tree_rdlock(global_inode_tree); + { + ino = unifyfs_inode_tree_search(global_inode_tree, gfid); + if (!ino) { + ret = ENOENT; + goto out_unlock_tree; + } + + unifyfs_inode_rdlock(ino); + { + int i = 0; + struct extent_tree* tree = ino->extents; + size_t n_nodes = tree->count; + struct extent_tree_node* _nodes = calloc(n_nodes, sizeof(*_nodes)); + struct extent_tree_node* current = NULL; + + if (!_nodes) { + ret = ENOMEM; + goto out_unlock_inode; + } + + while (NULL != (current = extent_tree_iter(tree, current))) { + _nodes[i] = *current; + i++; + } + + *n = n_nodes; + *nodes = _nodes; + } 
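
Every routine in unifyfs_inode.c follows the same locking discipline: take the global inode-tree lock, look up the inode, then take the per-inode rwlock before touching its fields. A minimal sketch of that pattern as a hypothetical helper that would sit alongside the functions above (it is not part of the patch):

    /* hypothetical: report whether a file has been laminated */
    static int inode_query_laminated(int gfid, int* is_laminated)
    {
        int ret = UNIFYFS_SUCCESS;
        struct unifyfs_inode* ino = NULL;

        unifyfs_inode_tree_rdlock(global_inode_tree);
        {
            ino = unifyfs_inode_tree_search(global_inode_tree, gfid);
            if (!ino) {
                ret = ENOENT;
                goto out_unlock_tree;
            }

            unifyfs_inode_rdlock(ino);
            {
                *is_laminated = ino->attr.is_laminated;
            }
            unifyfs_inode_unlock(ino);
        }
    out_unlock_tree:
        unifyfs_inode_tree_unlock(global_inode_tree);

        return ret;
    }
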
+out_unlock_inode: + unifyfs_inode_unlock(ino); + } +out_unlock_tree: + unifyfs_inode_tree_unlock(global_inode_tree); + + return ret; +} + +int unifyfs_inode_get_extent_chunks(unifyfs_inode_extent_t* extent, + unsigned int* n_chunks, + chunk_read_req_t** chunks) +{ + int ret = UNIFYFS_SUCCESS; + struct unifyfs_inode* ino = NULL; + int gfid = extent->gfid; + + unifyfs_inode_tree_rdlock(global_inode_tree); + { + ino = unifyfs_inode_tree_search(global_inode_tree, gfid); + if (!ino) { + ret = ENOENT; + goto out_unlock_tree; + } + + unifyfs_inode_rdlock(ino); + { + if (NULL != ino->extents) { + unsigned long offset = extent->offset; + unsigned long len = extent->length; + ret = extent_tree_get_chunk_list(ino->extents, offset, len, + n_chunks, chunks); + if (ret) { + LOGERR("failed to get chunks for gfid:%d, ret=%d", + gfid, ret); + } + } + } + unifyfs_inode_unlock(ino); + } +out_unlock_tree: + unifyfs_inode_tree_unlock(global_inode_tree); + + if (ret == UNIFYFS_SUCCESS) { + /* extent_tree_get_chunk_list does not populate the gfid field */ + for (unsigned int i = 0; i < *n_chunks; i++) { + (*chunks)[i].gfid = gfid; + } + } else { + *n_chunks = 0; + *chunks = NULL; + } + + return ret; +} + +static +int compare_chunk_read_reqs(const void* _c1, const void* _c2) +{ + chunk_read_req_t* c1 = (chunk_read_req_t*) _c1; + chunk_read_req_t* c2 = (chunk_read_req_t*) _c2; + + if (c1->rank > c2->rank) { + return 1; + } else if (c1->rank < c2->rank) { + return -1; + } else { + return 0; + } +} + + +int unifyfs_inode_resolve_extent_chunks(unsigned int n_extents, + unifyfs_inode_extent_t* extents, + unsigned int* n_locs, + chunk_read_req_t** chunklocs) +{ + int ret = UNIFYFS_SUCCESS; + unsigned int i = 0; + unsigned int j = 0; + unsigned int n_chunks = 0; + chunk_read_req_t* chunks = NULL; + unsigned int* n_resolved = NULL; + chunk_read_req_t** resolved = NULL; + + void* buf = calloc(n_extents, (sizeof(*n_resolved) + sizeof(*resolved))); + if (NULL == buf) { + LOGERR("failed to allocate memory"); + ret = ENOMEM; + goto out_fail; + } + + n_resolved = (unsigned int*) buf; + resolved = (chunk_read_req_t**) &n_resolved[n_extents]; + + /* resolve chunks addresses for all requests from inode tree */ + for (i = 0; i < n_extents; i++) { + unifyfs_inode_extent_t* current = &extents[i]; + + LOGDBG("resolving chunk request (gfid=%d, offset=%lu, length=%lu)", + current->gfid, current->offset, current->length); + + ret = unifyfs_inode_get_extent_chunks(current, + &n_resolved[i], &resolved[i]); + if (ret) { + LOGERR("failed to resolve the chunk request for chunk " + "[gfid=%d, offset=%lu, length=%zu] (ret=%d)", + current->gfid, current->offset, current->length, ret); + goto out_fail; + } + + n_chunks += n_resolved[i]; + } + + LOGDBG("resolved %d chunks for read request", n_chunks); + if (n_chunks > 0) { + /* store all chunks in a flat array */ + chunks = calloc(n_chunks, sizeof(*chunks)); + if (!chunks) { + LOGERR("failed to allocate memory for storing resolved chunks"); + ret = ENOMEM; + goto out_fail; + } + + chunk_read_req_t* pos = chunks; + for (i = 0; i < n_extents; i++) { + for (j = 0; j < n_resolved[i]; j++) { + *pos = resolved[i][j]; + pos++; + } + if (resolved[i]) { + free(resolved[i]); + } + } + + /* sort the requests based on server rank */ + qsort(chunks, n_chunks, sizeof(*chunks), compare_chunk_read_reqs); + + chunk_read_req_t* chk = chunks; + for (i = 0; i < n_chunks; i++, chk++) { + LOGDBG(" [%d] (offset=%lu, nbytes=%lu) @ (%d log(%d:%d:%lu))", + i, chk->offset, chk->nbytes, chk->rank, + chk->log_client_id, 
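
The resolved chunk requests are sorted by server rank so that requests bound for the same server land adjacent in the flat array. A sketch (not in the patch) of how a caller could exploit that ordering to form one batch per server after the qsort():

    /* hypothetical: log the contiguous per-server runs in the sorted array */
    static void log_per_server_batches(chunk_read_req_t* chunks,
                                       unsigned int n_chunks)
    {
        unsigned int i = 0;
        while (i < n_chunks) {
            int rank = chunks[i].rank;
            unsigned int first = i;
            while ((i < n_chunks) && (chunks[i].rank == rank)) {
                i++;
            }
            LOGDBG("server %d owns chunks [%u, %u)", rank, first, i);
        }
    }
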
chk->log_app_id, chk->log_offset); + } + } + + *n_locs = n_chunks; + *chunklocs = chunks; + +out_fail: + if (ret != UNIFYFS_SUCCESS) { + if (chunks) { + free(chunks); + chunks = NULL; + } + } + + if (NULL != buf) { + free(buf); + } + + return ret; +} + +int unifyfs_inode_span_extents( + int gfid, /* global file id we're looking in */ + unsigned long start, /* starting logical offset */ + unsigned long end, /* ending logical offset */ + int max, /* maximum number of key/vals to return */ + void* keys, /* array of length max for output keys */ + void* vals, /* array of length max for output values */ + int* outnum) /* number of entries returned */ +{ + int ret = UNIFYFS_SUCCESS; + struct unifyfs_inode* ino = NULL; + + unifyfs_inode_tree_rdlock(global_inode_tree); + { + ino = unifyfs_inode_tree_search(global_inode_tree, gfid); + if (!ino) { + ret = ENOENT; + goto out_unlock_tree; + } + + unifyfs_inode_rdlock(ino); + { + ret = extent_tree_span(ino->extents, gfid, start, end, + max, keys, vals, outnum); + if (ret) { + LOGERR("extent_tree_span failed (gfid=%d, ret=%d)", + gfid, ret); + } + } + unifyfs_inode_unlock(ino); + } +out_unlock_tree: + unifyfs_inode_tree_unlock(global_inode_tree); + + return ret; +} + +int unifyfs_inode_dump(int gfid) +{ + int ret = UNIFYFS_SUCCESS; + struct unifyfs_inode* ino = NULL; + + unifyfs_inode_tree_rdlock(global_inode_tree); + { + ino = unifyfs_inode_tree_search(global_inode_tree, gfid); + if (!ino) { + ret = ENOENT; + goto out_unlock_tree; + } + + unifyfs_inode_rdlock(ino); + { + LOGDBG("== inode (gfid=%d) ==\n", ino->gfid); + if (NULL != ino->extents) { + LOGDBG("extents:"); + extent_tree_dump(ino->extents); + } + } + unifyfs_inode_unlock(ino); + } +out_unlock_tree: + unifyfs_inode_tree_unlock(global_inode_tree); + + return ret; +} diff --git a/server/src/unifyfs_inode.h b/server/src/unifyfs_inode.h new file mode 100644 index 000000000..915fa6a6c --- /dev/null +++ b/server/src/unifyfs_inode.h @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#ifndef __UNIFYFS_INODE_H +#define __UNIFYFS_INODE_H + +#include +#include "tree.h" +#include "extent_tree.h" +#include "unifyfs_meta.h" +#include "unifyfs_global.h" + +/** + * @brief file extent descriptor + */ +struct unifyfs_inode_extent { + int gfid; + unsigned long offset; + unsigned long length; +}; +typedef struct unifyfs_inode_extent unifyfs_inode_extent_t; + +/** + * @brief file and directory inode structure. this holds: + */ +struct unifyfs_inode { + /* tree entry for global inode tree */ + RB_ENTRY(unifyfs_inode) inode_tree_entry; + + int gfid; /* global file identifier */ + unifyfs_file_attr_t attr; /* file attributes */ + struct extent_tree* extents; /* extent information */ + + pthread_rwlock_t rwlock; /* rwlock for pthread access */ + ABT_mutex abt_sync; /* mutex for argobots ULT access */ +}; + +/** + * @brief create a new inode with given parameters. The newly created inode + * will be inserted to the global inode tree (global_inode_tree). + * + * @param gfid global file identifier. + * @param attr attributes of the new file. 
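
Taken together, the API declared in this header (the prototypes follow below) covers the whole server-side life of a file. A hypothetical end-to-end sketch -- the gfid, path, and sizes are made up for illustration and error handling is abbreviated:

    static int inode_lifecycle_demo(void)
    {
        int gfid = 42;

        /* create the inode; the attr gfid must match the inode gfid */
        unifyfs_file_attr_t attr = { 0 };
        attr.gfid = gfid;
        attr.filename = "/unifyfs/demo.out";
        int rc = unifyfs_inode_metaset(gfid, UNIFYFS_FILE_ATTR_OP_CREATE,
                                       &attr);
        if (rc) {
            return rc;
        }

        /* register one 4 KiB extent held by server rank 0 */
        struct extent_tree_node extent = { 0 };
        extent.start = 0;
        extent.end = 4095;
        extent.svr_rank = 0;
        rc = unifyfs_inode_add_extents(gfid, 1, &extent);
        if (rc) {
            return rc;
        }

        /* the recorded size should now be 4096 */
        size_t filesize = 0;
        rc = unifyfs_inode_get_filesize(gfid, &filesize);
        if (rc) {
            return rc;
        }

        /* freeze the file */
        return unifyfs_inode_laminate(gfid);
    }
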
+ * + * @return 0 on success, errno otherwise + */ +int unifyfs_inode_create(int gfid, unifyfs_file_attr_t* attr); + +/** + * @brief update the attributes of file with @gfid. The attributes are + * selectively updated with unifyfs_file_attr_update() function (see + * common/unifyfs_meta.h). + * + * @param gfid global file identifier + * @param attr new attributes + * + * @return 0 on success, errno otherwise + */ +int unifyfs_inode_update_attr(int gfid, int attr_op, + unifyfs_file_attr_t* attr); + +/** + * @brief create a new or update an existing inode. + * + * @param gfid global file identifier + * @param create try to create a new inode if set + * @param attr file attributes + * + * @return 0 on success, errno otherwise + */ + +int unifyfs_inode_metaset(int gfid, int attr_op, + unifyfs_file_attr_t* attr); + +/** + * @brief read attributes for file with @gfid. + * + * @param gfid global file identifier + * @param attr [out] file attributes to be filled + * + * @return 0 on success, errno otherwise + */ +int unifyfs_inode_metaget(int gfid, unifyfs_file_attr_t* attr); + +/** + * @brief unlink file with @gfid. this will remove the target file inode from + * the global inode tree. + * + * @param gfid global file identifier + * + * @return 0 on success, errno otherwise + */ +int unifyfs_inode_unlink(int gfid); + +/** + * @brief truncate size of file with @gfid to @size. + * + * @param gfid global file identifier + * @param size new file size + * + * @return 0 on success, errno otherwise + */ +int unifyfs_inode_truncate(int gfid, unsigned long size); + +/** + * @brief get the local extent array from the target inode + * + * @param gfid the global file identifier + * @param n the number of extents, set by this function + * @param nodes the pointer to the array of extents, caller should free this + * + * @return 0 on success, errno otherwise + */ +int unifyfs_inode_get_extents(int gfid, size_t* n, + struct extent_tree_node** nodes); + +/** + * @brief add new extents to the inode + * + * @param gfid the global file identifier + * @param n the number of new extents in @nodes + * @param nodes an array of extents to be added + * + * @return + */ +int unifyfs_inode_add_extents(int gfid, int n, struct extent_tree_node* nodes); + +/** + * @brief get the maximum file size from the local extent tree of given file + * + * @param gfid global file identifier + * @param offset [out] file offset to be filled by this function + * + * @return 0 on success, errno otherwise + */ +int unifyfs_inode_get_filesize(int gfid, size_t* offset); + +/** + * @brief set the given file as laminated + * + * @param gfid global file identifier + * + * @return 0 on success, errno otherwise + */ +int unifyfs_inode_laminate(int gfid); + +/** + * @brief Get chunks for given file extent + * + * @param extent target file extent + * + * @param[out] n_chunks number of output chunk locations + * @param[out] chunks array of output chunk locations + * + * @return UNIFYFS_SUCCESS, or error code + */ +int unifyfs_inode_get_extent_chunks(unifyfs_inode_extent_t* extent, + unsigned int* n_chunks, + chunk_read_req_t** chunks); + +/** + * @brief Get chunk locations for an array of file extents + * + * @param n_extents number of input extents + * @param extents array or requested extents + * + * @param[out] n_locs number of output chunk locations + * @param[out] chunklocs array of output chunk locations + * + * @return UNIFYFS_SUCCESS, or error code + */ +int unifyfs_inode_resolve_extent_chunks(unsigned int n_extents, + unifyfs_inode_extent_t* 
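
unifyfs_inode_get_extent_chunks() and unifyfs_inode_resolve_extent_chunks() form the read path: they translate byte ranges into the chunk locations that actually hold the data. A caller-side sketch with illustrative values (not part of the patch):

    static void resolve_demo(void)
    {
        /* two 1 MiB ranges of the same file */
        unifyfs_inode_extent_t req[2] = {
            { .gfid = 42, .offset = 0,       .length = 1048576 },
            { .gfid = 42, .offset = 1048576, .length = 1048576 },
        };

        unsigned int n_locs = 0;
        chunk_read_req_t* locs = NULL;

        int rc = unifyfs_inode_resolve_extent_chunks(2, req,
                                                     &n_locs, &locs);
        if (rc == UNIFYFS_SUCCESS) {
            for (unsigned int i = 0; i < n_locs; i++) {
                LOGDBG("chunk %u: offset=%lu nbytes=%lu server=%d",
                       i, locs[i].offset, locs[i].nbytes, locs[i].rank);
            }
            free(locs);   /* caller owns the returned array */
        }
    }
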
extents, + unsigned int* n_locs, + chunk_read_req_t** chunklocs); + +/** + * @brief calls extents_tree_span, which will do: + * + * given an extent tree and starting and ending logical offsets, fill in + * key/value entries that overlap that range, returns at most max entries + * starting from lowest starting offset, sets outnum with actual number of + * entries returned + * + * @param gfid global file id + * @param start starting logical offset + * @param end ending logical offset + * @param max maximum number of key/vals to return + * @param keys array of length max for output keys + * @param vals array of length max for output values + * @param outnum number of entries returned + * + * @return + */ +int unifyfs_inode_span_extents( + int gfid, /* global file id we're looking in */ + unsigned long start, /* starting logical offset */ + unsigned long end, /* ending logical offset */ + int max, /* maximum number of key/vals to return */ + void* keys, /* array of length max for output keys */ + void* vals, /* array of length max for output values */ + int* outnum); /* number of entries returned */ + +/** + * @brief prints the inode information to the log stream + * + * @param gfid global file identifier + * + * @return 0 on success, errno otherwise + */ +int unifyfs_inode_dump(int gfid); + +#endif /* __UNIFYFS_INODE_H */ + diff --git a/server/src/unifyfs_inode_tree.c b/server/src/unifyfs_inode_tree.c new file mode 100644 index 000000000..311a6a994 --- /dev/null +++ b/server/src/unifyfs_inode_tree.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#include +#include +#include +#include +#include +#include + +#include "unifyfs_inode_tree.h" + +#undef MIN +#undef MAX +#define MIN(a, b) (a < b ? a : b) +#define MAX(a, b) (a > b ? a : b) + +static int unifyfs_inode_tree_compare_func( + struct unifyfs_inode* node1, + struct unifyfs_inode* node2) +{ + if (node1->gfid > node2->gfid) { + return 1; + } else if (node1->gfid < node2->gfid) { + return -1; + } else { + return 0; + } +} + +RB_PROTOTYPE( + rb_inode_tree, unifyfs_inode, + inode_tree_entry, unifyfs_inode_tree_compare_func) +RB_GENERATE( + rb_inode_tree, unifyfs_inode, + inode_tree_entry, unifyfs_inode_tree_compare_func) + +/* Returns 0 on success, positive non-zero error code otherwise */ +int unifyfs_inode_tree_init( + struct unifyfs_inode_tree* tree) +{ + int ret = 0; + + if (!tree) { + return EINVAL; + } + + memset(tree, 0, sizeof(*tree)); + ret = pthread_rwlock_init(&tree->rwlock, NULL); + RB_INIT(&tree->head); + + return ret; +} + +/* Remove and free all nodes in the unifyfs_inode_tree. 
*/ +void unifyfs_inode_tree_destroy( + struct unifyfs_inode_tree* tree) +{ + if (tree) { + unifyfs_inode_tree_clear(tree); + pthread_rwlock_destroy(&tree->rwlock); + } +} + +int unifyfs_inode_tree_insert( + struct unifyfs_inode_tree* tree, /* tree on which to add new entry */ + struct unifyfs_inode* ino) /* initial file attribute */ +{ + int ret = 0; + struct unifyfs_inode* existing = NULL; + + if (!ino || (ino->gfid != ino->attr.gfid)) { + return EINVAL; + } + + /* check if the node already exists */ + existing = RB_FIND(rb_inode_tree, &tree->head, ino); + if (existing) { + return EEXIST; + } + + RB_INSERT(rb_inode_tree, &tree->head, ino); + + return ret; +} + +/* Search for and return entry for given gfid on specified tree. + * If not found, return NULL, assumes caller has lock on tree */ +struct unifyfs_inode* unifyfs_inode_tree_search( + struct unifyfs_inode_tree* tree, + int gfid) +{ + struct unifyfs_inode node = { .gfid = gfid, }; + + return RB_FIND(rb_inode_tree, &tree->head, &node); +} + +int unifyfs_inode_tree_remove( + struct unifyfs_inode_tree* tree, + int gfid, + struct unifyfs_inode** removed) +{ + int ret = 0; + struct unifyfs_inode* ino = NULL; + + ino = unifyfs_inode_tree_search(tree, gfid); + if (!ino) { + return ENOENT; + } + + RB_REMOVE(rb_inode_tree, &tree->head, ino); + + *removed = ino; + + return ret; +} + +/* + * Given a range tree and a starting node, iterate though all the nodes + * in the tree, returning the next one each time. If start is NULL, then + * start with the first node in the tree. + * + * This is meant to be called in a loop, like: + * + * gfid2ext_tree_rdlock(tree); + * + * struct unifyfs_inode *node = NULL; + * while ((node = gfid2ext_tree_iter(tree, node))) { + * printf("[%d-%d]", node->start, node->end); + * } + * + * gfid2ext_tree_unlock(tree); + * + * Note: this function does no locking, and assumes you're properly locking + * and unlocking the gfid2ext_tree before doing the iteration (see + * gfid2ext_tree_rdlock()/gfid2ext_tree_wrlock()/gfid2ext_tree_unlock()). + */ +struct unifyfs_inode* unifyfs_inode_tree_iter( + struct unifyfs_inode_tree* tree, + struct unifyfs_inode* start) +{ + struct unifyfs_inode* next = NULL; + if (start == NULL) { + /* Initial case, no starting node */ + next = RB_MIN(rb_inode_tree, &tree->head); + return next; + } + + /* + * We were given a valid start node. Look it up to start our traversal + * from there. + */ + next = RB_FIND(rb_inode_tree, &tree->head, start); + if (!next) { + /* Some kind of error */ + return NULL; + } + + /* Look up our next node */ + next = RB_NEXT(rb_inode_tree, &tree->head, start); + + return next; +} + +/* + * Remove all nodes in unifyfs_inode_tree, but keep it initialized so you can + * unifyfs_inode_tree_add() to it. 
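
A usage sketch of the iterator together with the locking it requires, using the fields unifyfs_inode actually provides (assumes access to the global_inode_tree instance defined in unifyfs_inode.c):

    unifyfs_inode_tree_rdlock(global_inode_tree);

    struct unifyfs_inode* node = NULL;
    while (NULL != (node = unifyfs_inode_tree_iter(global_inode_tree, node))) {
        LOGDBG("active inode: gfid=%d size=%lu",
               node->gfid, (unsigned long) node->attr.size);
    }

    unifyfs_inode_tree_unlock(global_inode_tree);
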
+ */ +void unifyfs_inode_tree_clear( + struct unifyfs_inode_tree* tree) +{ + struct unifyfs_inode* node = NULL; + struct unifyfs_inode* oldnode = NULL; + + unifyfs_inode_tree_wrlock(tree); + + if (RB_EMPTY(&tree->head)) { + /* unifyfs_inode_tree is empty, nothing to do */ + unifyfs_inode_tree_unlock(tree); + return; + } + + /* Remove and free each node in the tree */ + while ((node = unifyfs_inode_tree_iter(tree, node))) { + if (oldnode) { + RB_REMOVE(rb_inode_tree, &tree->head, oldnode); + if (oldnode->extents != NULL) { + extent_tree_destroy(oldnode->extents); + } + free(oldnode); + } + oldnode = node; + } + if (oldnode) { + RB_REMOVE(rb_inode_tree, &tree->head, oldnode); + if (oldnode->extents != NULL) { + extent_tree_destroy(oldnode->extents); + } + free(oldnode); + } + + unifyfs_inode_tree_unlock(tree); +} + diff --git a/server/src/unifyfs_inode_tree.h b/server/src/unifyfs_inode_tree.h new file mode 100644 index 000000000..8bec5d8d2 --- /dev/null +++ b/server/src/unifyfs_inode_tree.h @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#ifndef __UNIFYFS_INODE_TREE_H +#define __UNIFYFS_INODE_TREE_H + +#include +#include "tree.h" +#include "extent_tree.h" +#include "unifyfs_meta.h" +#include "unifyfs_inode.h" + +/* + * unifyfs_inode_tree: balanced binary tree (RB) for keeping active inodes. + * + * NOTE: except for unifyfs_inode_tree_destroy, none of the following functions + * perform locking itself, but the caller should accordingly lock/unlock using + * unifyfs_inode_tree_rdlock, unifyfs_inode_tree_wrlock, and + * unifyfs_inode_tree_unlock. + */ +struct unifyfs_inode_tree { + RB_HEAD(rb_inode_tree, unifyfs_inode) head; /** inode RB tree */ + pthread_rwlock_t rwlock; /** lock for accessing tree */ +}; + +/** + * @brief initialize the inode tree. + * + * @param tree the tree structure to be initialized. this should be allocated + * by the caller. + * + * @return 0 on success, errno otherwise + */ +int unifyfs_inode_tree_init(struct unifyfs_inode_tree* tree); + +/** + * @brief Remove all nodes in unifyfs_inode_tree, but keep it initialized so + * you can unifyfs_inode_tree_add() to it. + * + * @param tree inode tree to remove + */ +void unifyfs_inode_tree_clear(struct unifyfs_inode_tree* tree); + +/** + * @brief Remove and free all nodes in the unifyfs_inode_tree. + * + * @param tree inode tree to remove + */ +void unifyfs_inode_tree_destroy(struct unifyfs_inode_tree* tree); + +/** + * @brief Insert a new inode to the tree. + * + * @param tree inode tree + * @param ino new inode to insert + * + * @return 0 on success, errno otherwise + */ +int unifyfs_inode_tree_insert(struct unifyfs_inode_tree* tree, + struct unifyfs_inode* ino); + +/** + * @brief Remove an inode with @gfid. + * + * @param tree inode tree + * @param gfid global file identifier of the target inode + * @param removed [out] removed inode + * + * @return 0 on success, errno otherwise + */ +int unifyfs_inode_tree_remove(struct unifyfs_inode_tree* tree, + int gfid, struct unifyfs_inode** removed); + +/* Search for and return extents for given gfid on specified tree. 
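
Nothing ties these functions to the global instance; a tree can be instantiated on its own, which is convenient in unit tests. A self-contained sketch with made-up values (not part of the patch):

    struct unifyfs_inode_tree tree;
    unifyfs_inode_tree_init(&tree);

    /* build a minimal inode; insertion requires gfid == attr.gfid */
    struct unifyfs_inode* ino = calloc(1, sizeof(*ino));
    ino->gfid = 7;
    ino->attr.gfid = 7;
    ino->attr.filename = "/unifyfs/t.dat";

    unifyfs_inode_tree_wrlock(&tree);
    int rc = unifyfs_inode_tree_insert(&tree, ino);
    unifyfs_inode_tree_unlock(&tree);

    unifyfs_inode_tree_rdlock(&tree);
    struct unifyfs_inode* found = unifyfs_inode_tree_search(&tree, 7);
    unifyfs_inode_tree_unlock(&tree);

    if ((rc == 0) && (NULL != found)) {
        LOGDBG("inserted and found gfid=%d", found->gfid);
    }

    /* clears the tree and frees the remaining nodes */
    unifyfs_inode_tree_destroy(&tree);
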
+ * If not found, return NULL, assumes caller has lock on tree */ +struct unifyfs_inode* unifyfs_inode_tree_search( + struct unifyfs_inode_tree* tree, /* tree to search */ + int gfid /* global file id to find */ +); + +/** + * @brief Iterate the inode tree. + * + * Given a range tree and a starting node, iterate though all the nodes + * in the tree, returning the next one each time. If start is NULL, then + * start with the first node in the tree. + * + * This is meant to be called in a loop, like: + * + * unifyfs_inode_tree_rdlock(tree); + * + * struct unifyfs_inode *node = NULL; + * while ((node = unifyfs_inode_tree_iter(tree, node))) { + * printf("[%d-%d]", node->start, node->end); + * } + * + * unifyfs_inode_tree_unlock(tree); + * + * Note: this function does no locking, and assumes you're properly locking + * and unlocking the unifyfs_inode_tree before doing the iteration; see: + * unifyfs_inode_tree_rdlock(), unifyfs_inode_tree_wrlock(), + * unifyfs_inode_tree_unlock(). + * + * @param tree inode tree to iterate + * @param start the starting node + * + * @return inode structure + */ +struct unifyfs_inode* unifyfs_inode_tree_iter(struct unifyfs_inode_tree* tree, + struct unifyfs_inode* start); + +/* + * Locking functions for use with unifyfs_inode_tree_iter(). They allow you to + * lock the tree to iterate over it: + * + * unifyfs_inode_tree_rdlock(&tree); + * + * struct unifyfs_inode *node = NULL; + * while ((node = unifyfs_inode_tree_iter(tree, node))) { + * printf("[%d-%d]", node->start, node->end); + * } + * + * unifyfs_inode_tree_unlock(&tree); + */ + +/** + * @brief Lock a unifyfs_inode_tree for reading. This should only be used for + * calling unifyfs_inode_tree_iter(). All the other unifyfs_inode_tree + * functions provide their own locking. + * + * @param tree inode tree + * + * @return 0 on success, errno otherwise + */ +static inline int unifyfs_inode_tree_rdlock(struct unifyfs_inode_tree* tree) +{ + return pthread_rwlock_rdlock(&tree->rwlock); +} + +/** + * @brief Lock a unifyfs_inode_tree for read/write. This should only be used + * for calling unifyfs_inode_tree_iter(). All the other unifyfs_inode_tree + * functions provide their own locking. + * + * @param tree inode tree + * + * @return 0 on success, errno otherwise + */ +static inline int unifyfs_inode_tree_wrlock(struct unifyfs_inode_tree* tree) +{ + return pthread_rwlock_wrlock(&tree->rwlock); +} + +/** + * @brief Unlock a unifyfs_inode_tree for read/write. This should only be used + * for calling unifyfs_inode_tree_iter(). All the other unifyfs_inode_tree + * functions provide their own locking. + * + * @param tree inode tree + */ +static inline void unifyfs_inode_tree_unlock(struct unifyfs_inode_tree* tree) +{ + pthread_rwlock_unlock(&tree->rwlock); +} + +#endif /* __UNIFYFS_INODE_TREE_H */ + diff --git a/server/src/unifyfs_metadata.c b/server/src/unifyfs_metadata.c deleted file mode 100644 index 0f4436a65..000000000 --- a/server/src/unifyfs_metadata.c +++ /dev/null @@ -1,487 +0,0 @@ -/* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * - * Copyright 2017-2019, UT-Battelle, LLC. - * - * LLNL-CODE-741539 - * All rights reserved. - * - * This is the license for UnifyFS. - * For details, see https://github.com/LLNL/UnifyFS. - * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. - */ - -/* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. 
- * Copyright (c) 2017, Florida State University. Contributions from - * the Computer Architecture and Systems Research Laboratory (CASTL) - * at the Department of Computer Science. - * - * Written by: Teng Wang, Adam Moody, Wekuan Yu, Kento Sato, Kathryn Mohror - * LLNL-CODE-728877. All rights reserved. - * - * This file is part of burstfs. - * For details, see https://github.com/llnl/burstfs - * Please read https://github.com/llnl/burstfs/LICNSE for full license text. - */ - -// NOTE: following two lines needed for nftw(), MUST COME FIRST IN FILE -#define _XOPEN_SOURCE 500 -#include - -// common headers -#include "unifyfs_client_rpcs.h" -#include "ucr_read_builder.h" - -// server headers -#include "unifyfs_global.h" -#include "unifyfs_metadata.h" - -// MDHIM headers -#include "indexes.h" -#include "mdhim.h" - - -unifyfs_key_t** unifyfs_keys; -unifyfs_val_t** unifyfs_vals; - -fattr_key_t** fattr_keys; -fattr_val_t** fattr_vals; - -char* manifest_path; - -struct mdhim_brm_t* brm, *brmp; -struct mdhim_bgetrm_t* bgrm, *bgrmp; - -struct mdhim_t* md; -int md_size; - -struct index_t* unifyfs_indexes[2]; -size_t max_recs_per_slice; - -void debug_log_key_val(const char* ctx, - unifyfs_key_t* key, - unifyfs_val_t* val) -{ - if ((key != NULL) && (val != NULL)) { - LOGDBG("@%s - key(fid=%d, offset=%lu), " - "val(del=%d, len=%lu, addr=%lu, app=%d, rank=%d)", - ctx, key->fid, key->offset, - val->delegator_rank, val->len, val->addr, - val->app_id, val->rank); - } else if (key != NULL) { - LOGDBG("@%s - key(fid=%d, offset=%lu)", - ctx, key->fid, key->offset); - } -} - -int unifyfs_key_compare(unifyfs_key_t* a, unifyfs_key_t* b) -{ - assert((NULL != a) && (NULL != b)); - if (a->fid == b->fid) { - if (a->offset == b->offset) { - return 0; - } else if (a->offset < b->offset) { - return -1; - } else { - return 1; - } - } else if (a->fid < b->fid) { - return -1; - } else { - return 1; - } -} - -/* initialize the key-value store */ -int meta_init_store(unifyfs_cfg_t* cfg) -{ - int rc, ratio; - size_t path_len; - long svr_ratio, range_sz; - MPI_Comm comm = MPI_COMM_WORLD; - - if (cfg == NULL) { - return -1; - } - - mdhim_options_t* db_opts = mdhim_options_init(); - if (db_opts == NULL) { - return -1; - } - mdhim_options_set_db_type(db_opts, LEVELDB); - mdhim_options_set_db_name(db_opts, cfg->meta_db_name); - mdhim_options_set_key_type(db_opts, MDHIM_UNIFYFS_KEY); - mdhim_options_set_debug_level(db_opts, MLOG_CRIT); - - /* UNIFYFS_META_DB_PATH: root directory for metadata */ - mdhim_options_set_db_path(db_opts, cfg->meta_db_path); - - /* number of metadata servers = - * number of unifyfs servers / UNIFYFS_META_SERVER_RATIO */ - svr_ratio = 0; - rc = configurator_int_val(cfg->meta_server_ratio, &svr_ratio); - if (rc != 0) { - return -1; - } - ratio = (int) svr_ratio; - mdhim_options_set_server_factor(db_opts, ratio); - - /* indices/attributes are striped to servers according - * to config setting for UNIFYFS_META_RANGE_SIZE. 
*/ - range_sz = 0; - rc = configurator_int_val(cfg->meta_range_size, &range_sz); - if (rc != 0) { - return -1; - } - max_recs_per_slice = (size_t) range_sz; - mdhim_options_set_max_recs_per_slice(db_opts, (uint64_t)range_sz); - - md = mdhimInit(&comm, db_opts); - - /*this index is created for storing index metadata*/ - unifyfs_indexes[0] = md->primary_index; - - /*this index is created for storing file attribute metadata*/ - unifyfs_indexes[1] = create_global_index(md, ratio, 1, - LEVELDB, MDHIM_INT_KEY, - "file_attr"); - - MPI_Comm_size(md->mdhim_comm, &md_size); - - rc = meta_init_indices(); - if (rc != 0) { - return -1; - } - - return 0; -} - -/* initialize the key and value list used to put/get key-value pairs - * TODO: split once the number of metadata exceeds MAX_META_PER_SEND */ -int meta_init_indices(void) -{ - int i; - - /* init index metadata */ - unifyfs_keys = (unifyfs_key_t**) - calloc(MAX_META_PER_SEND, sizeof(unifyfs_key_t*)); - if (unifyfs_keys == NULL) { - return (int)UNIFYFS_ERROR_NOMEM; - } - - unifyfs_vals = (unifyfs_val_t**) - calloc(MAX_META_PER_SEND, sizeof(unifyfs_val_t*)); - if (unifyfs_vals == NULL) { - return (int)UNIFYFS_ERROR_NOMEM; - } - - for (i = 0; i < MAX_META_PER_SEND; i++) { - unifyfs_keys[i] = (unifyfs_key_t*) calloc(1, sizeof(unifyfs_key_t)); - if (unifyfs_keys[i] == NULL) { - return (int)UNIFYFS_ERROR_NOMEM; - } - - unifyfs_vals[i] = (unifyfs_val_t*) calloc(1, sizeof(unifyfs_val_t)); - if (unifyfs_vals[i] == NULL) { - return (int)UNIFYFS_ERROR_NOMEM; - } - } - - /* init attribute metadata */ - fattr_keys = (fattr_key_t**) - calloc(MAX_FILE_CNT_PER_NODE, sizeof(fattr_key_t*)); - if (fattr_keys == NULL) { - return (int)UNIFYFS_ERROR_NOMEM; - } - - fattr_vals = (fattr_val_t**) - calloc(MAX_FILE_CNT_PER_NODE, sizeof(fattr_val_t*)); - if (fattr_vals == NULL) { - return (int)UNIFYFS_ERROR_NOMEM; - } - - for (i = 0; i < MAX_FILE_CNT_PER_NODE; i++) { - fattr_keys[i] = (fattr_key_t*) calloc(1, sizeof(fattr_key_t)); - if (fattr_keys[i] == NULL) { - return (int)UNIFYFS_ERROR_NOMEM; - } - - fattr_vals[i] = (fattr_val_t*) calloc(1, sizeof(fattr_val_t)); - if (fattr_vals[i] == NULL) { - return (int)UNIFYFS_ERROR_NOMEM; - } - } - - return 0; -} - -void print_bget_indices(int app_id, int cli_id, - send_msg_t* msgs, int tot_num) -{ - int i; - for (i = 0; i < tot_num; i++) { - LOGDBG("index:dbg_rank:%d, dest_offset:%zu, " - "dest_del_rank:%d, dest_cli_id:%d, dest_app_id:%d, " - "length:%zu, src_app_id:%d, src_cli_id:%d, src_offset:%zu, " - "src_del_rank:%d, src_fid:%d, num:%d", - msgs[i].src_dbg_rank, msgs[i].dest_offset, - msgs[i].dest_delegator_rank, msgs[i].dest_client_id, - msgs[i].dest_app_id, msgs[i].length, - msgs[i].src_app_id, msgs[i].src_cli_id, - msgs[i].src_offset, msgs[i].src_delegator_rank, - msgs[i].src_fid, tot_num); - } -} - -void print_fsync_indices(unifyfs_key_t** keys, - unifyfs_val_t** vals, - size_t num_entries) -{ - size_t i; - for (i = 0; i < num_entries; i++) { - LOGDBG("fid:%d, offset:%lu, addr:%lu, len:%lu, del_id:%d", - keys[i]->fid, keys[i]->offset, - vals[i]->addr, vals[i]->len, - vals[i]->delegator_rank); - } -} - -void meta_free_indices(void) -{ - int i; - if (NULL != unifyfs_keys) { - for (i = 0; i < MAX_META_PER_SEND; i++) { - if (NULL != unifyfs_keys[i]) { - free(unifyfs_keys[i]); - } - if (NULL != unifyfs_vals[i]) { - free(unifyfs_vals[i]); - } - } - free(unifyfs_keys); - free(unifyfs_vals); - } - if (NULL != fattr_keys) { - for (i = 0; i < MAX_FILE_CNT_PER_NODE; i++) { - if (NULL != fattr_keys[i]) { - free(fattr_keys[i]); - } - if 
(NULL != fattr_vals[i]) { - free(fattr_vals[i]); - } - } - free(fattr_keys); - free(fattr_vals); - } -} - -static int remove_cb(const char* fpath, const struct stat* sb, - int typeflag, struct FTW* ftwbuf) -{ - int rc = remove(fpath); - if (rc) { - LOGERR("failed to remove(%s)", fpath); - } - return rc; -} - -static int remove_mdhim_db_filetree(char* db_root_path) -{ - LOGDBG("remove MDHIM DB filetree at %s", db_root_path); - return nftw(db_root_path, remove_cb, 64, FTW_DEPTH | FTW_PHYS); -} - - -int meta_sanitize(void) -{ - int rc; - char dbpath[UNIFYFS_MAX_FILENAME] = {0}; - - snprintf(dbpath, sizeof(dbpath), "%s", md->db_opts->db_path); - - mdhimClose(md); - md = NULL; - - rc = remove_mdhim_db_filetree(dbpath); - if (rc) { - LOGERR("failure during MDHIM file tree removal"); - } - - meta_free_indices(); - - return UNIFYFS_SUCCESS; -} - -// New API -/* - * - */ -int unifyfs_set_file_attribute(unifyfs_file_attr_t* fattr_ptr) -{ - int rc = UNIFYFS_SUCCESS; - - int gfid = fattr_ptr->gfid; - - md->primary_index = unifyfs_indexes[1]; - brm = mdhimPut(md, &gfid, sizeof(int), - fattr_ptr, sizeof(unifyfs_file_attr_t), - NULL, NULL); - if (!brm || brm->error) { - // return UNIFYFS_ERROR_MDHIM on error - rc = (int)UNIFYFS_ERROR_MDHIM; - } - - mdhim_full_release_msg(brm); - - return rc; -} - -/* - * - */ -int unifyfs_set_file_attributes(int num_entries, - fattr_key_t** keys, int* key_lens, - unifyfs_file_attr_t** fattr_ptr, int* val_lens) -{ - int rc = UNIFYFS_SUCCESS; - - md->primary_index = unifyfs_indexes[1]; - brm = mdhimBPut(md, (void**)keys, key_lens, (void**)fattr_ptr, - val_lens, num_entries, NULL, NULL); - brmp = brm; - if (!brmp || brmp->error) { - rc = (int)UNIFYFS_ERROR_MDHIM; - LOGERR("Error inserting keys/values into MDHIM"); - } - - while (brmp) { - if (brmp->error < 0) { - rc = (int)UNIFYFS_ERROR_MDHIM; - break; - } - - brm = brmp; - brmp = brmp->next; - mdhim_full_release_msg(brm); - } - - return rc; -} - -/* - * - */ -int unifyfs_get_file_attribute(int gfid, - unifyfs_file_attr_t* attr_val_ptr) -{ - int rc = UNIFYFS_SUCCESS; - unifyfs_file_attr_t* tmp_ptr_attr; - - md->primary_index = unifyfs_indexes[1]; - bgrm = mdhimGet(md, md->primary_index, &gfid, - sizeof(int), MDHIM_GET_EQ); - if (!bgrm || bgrm->error) { - rc = (int)UNIFYFS_ERROR_MDHIM; - } else { - tmp_ptr_attr = (unifyfs_file_attr_t*)bgrm->values[0]; - memcpy(attr_val_ptr, tmp_ptr_attr, sizeof(unifyfs_file_attr_t)); - mdhim_full_release_msg(bgrm); - } - - return rc; -} - -/* - * - */ -int unifyfs_get_file_extents(int num_keys, unifyfs_key_t** keys, - int* key_lens, int* num_values, - unifyfs_keyval_t** keyvals) -{ - /* - * This is using a modified version of mdhim. The function will return all - * key-value pairs within the range of the key tuple. - * We need to re-evaluate this function to use different key-value stores. 
- */ - - int i; - int rc = UNIFYFS_SUCCESS; - int tot_num = 0; - - unifyfs_key_t* tmp_key; - unifyfs_val_t* tmp_val; - unifyfs_keyval_t* kviter = *keyvals; - - md->primary_index = unifyfs_indexes[0]; - bgrm = mdhimBGet(md, md->primary_index, (void**)keys, - key_lens, num_keys, MDHIM_RANGE_BGET); - - while (bgrm) { - bgrmp = bgrm; - if (bgrmp->error < 0) { - // TODO: need better error handling - rc = (int)UNIFYFS_ERROR_MDHIM; - return rc; - } - - if (tot_num < MAX_META_PER_SEND) { - for (i = 0; i < bgrmp->num_keys; i++) { - tmp_key = (unifyfs_key_t*)bgrmp->keys[i]; - tmp_val = (unifyfs_val_t*)bgrmp->values[i]; - memcpy(&(kviter->key), tmp_key, sizeof(unifyfs_key_t)); - memcpy(&(kviter->val), tmp_val, sizeof(unifyfs_val_t)); - kviter++; - tot_num++; - if (MAX_META_PER_SEND == tot_num) { - LOGERR("Error: maximum number of values!"); - rc = UNIFYFS_FAILURE; - break; - } - } - } - bgrm = bgrmp->next; - mdhim_full_release_msg(bgrmp); - } - - *num_values = tot_num; - - return rc; -} - -/* - * - */ -int unifyfs_set_file_extents(int num_entries, - unifyfs_key_t** keys, int* key_lens, - unifyfs_val_t** vals, int* val_lens) -{ - int rc = UNIFYFS_SUCCESS; - - md->primary_index = unifyfs_indexes[0]; - - brm = mdhimBPut(md, (void**)(keys), key_lens, - (void**)(vals), val_lens, num_entries, - NULL, NULL); - brmp = brm; - if (!brmp || brmp->error) { - rc = (int)UNIFYFS_ERROR_MDHIM; - LOGERR("Error inserting keys/values into MDHIM"); - } - - while (brmp) { - if (brmp->error < 0) { - rc = (int)UNIFYFS_ERROR_MDHIM; - break; - } - - brm = brmp; - brmp = brmp->next; - mdhim_full_release_msg(brm); - } - - return rc; -} - diff --git a/server/src/unifyfs_metadata_mdhim.c b/server/src/unifyfs_metadata_mdhim.c new file mode 100644 index 000000000..5ae6635a6 --- /dev/null +++ b/server/src/unifyfs_metadata_mdhim.c @@ -0,0 +1,576 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +/* + * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * Copyright (c) 2017, Florida State University. Contributions from + * the Computer Architecture and Systems Research Laboratory (CASTL) + * at the Department of Computer Science. + * + * Written by: Teng Wang, Adam Moody, Wekuan Yu, Kento Sato, Kathryn Mohror + * LLNL-CODE-728877. All rights reserved. + * + * This file is part of burstfs. + * For details, see https://github.com/llnl/burstfs + * Please read https://github.com/llnl/burstfs/LICNSE for full license text. 
+ */ + +// NOTE: following two lines needed for nftw(), MUST COME FIRST IN FILE +#define _XOPEN_SOURCE 500 +#include + +// common headers +#include "unifyfs_client_rpcs.h" + +// server headers +#include "unifyfs_global.h" +#include "unifyfs_metadata_mdhim.h" + +// MDHIM headers +#include "indexes.h" +#include "mdhim.h" + +struct mdhim_t* md; + +/* we use two MDHIM indexes: + * 0) for file extents + * 1) for file attributes */ +#define IDX_FILE_EXTENTS (0) +#define IDX_FILE_ATTR (1) +struct index_t* unifyfs_indexes[2]; + + +int unifyfs_key_compare(unifyfs_key_t* a, unifyfs_key_t* b) +{ + assert((NULL != a) && (NULL != b)); + if (a->gfid == b->gfid) { + if (a->offset == b->offset) { + return 0; + } else if (a->offset < b->offset) { + return -1; + } else { + return 1; + } + } else if (a->gfid < b->gfid) { + return -1; + } else { + return 1; + } +} + +/* order keyvals by gfid, then host delegator rank */ +int unifyfs_keyval_compare(const void* a, const void* b) +{ + assert((NULL != a) && (NULL != b)); + + const unifyfs_keyval_t* kv_a = a; + const unifyfs_keyval_t* kv_b = b; + + int gfid_a = kv_a->key.gfid; + int gfid_b = kv_b->key.gfid; + if (gfid_a == gfid_b) { + int rank_a = kv_a->val.delegator_rank; + int rank_b = kv_b->val.delegator_rank; + if (rank_a == rank_b) { + return 0; + } else if (rank_a < rank_b) { + return -1; + } else { + return 1; + } + } else if (gfid_a < gfid_b) { + return -1; + } else { + return 1; + } +} + +/* initialize the key-value store */ +int meta_init_store(unifyfs_cfg_t* cfg) +{ + int rc, ratio; + MPI_Comm comm = MPI_COMM_WORLD; + long svr_ratio, range_sz; + struct stat ss; + char db_path[UNIFYFS_MAX_FILENAME] = {0}; + + if (cfg == NULL) { + return -1; + } + + mdhim_options_t* db_opts = mdhim_options_init(); + if (db_opts == NULL) { + return -1; + } + mdhim_options_set_db_type(db_opts, LEVELDB); + mdhim_options_set_db_name(db_opts, cfg->meta_db_name); + mdhim_options_set_key_type(db_opts, MDHIM_UNIFYFS_KEY); + mdhim_options_set_debug_level(db_opts, MLOG_CRIT); + + /* UNIFYFS_META_DB_PATH: root directory for metadata */ + snprintf(db_path, sizeof(db_path), "%s/mdhim", cfg->meta_db_path); + rc = stat(db_path, &ss); + if (rc != 0) { + rc = mkdir(db_path, 0770); + if (rc != 0) { + LOGERR("failed to create MDHIM metadata directory %s", db_path); + return -1; + } + } + mdhim_options_set_db_path(db_opts, strdup(db_path)); + + /* number of metadata servers = + * number of unifyfs servers / UNIFYFS_META_SERVER_RATIO */ + svr_ratio = 0; + rc = configurator_int_val(cfg->meta_server_ratio, &svr_ratio); + if (rc != 0) { + return -1; + } + ratio = (int) svr_ratio; + mdhim_options_set_server_factor(db_opts, ratio); + + /* indices/attributes are striped to servers according + * to config setting for UNIFYFS_META_RANGE_SIZE. 
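
unifyfs_keyval_compare() matches the qsort() comparator prototype, so key/value results gathered from several delegators can be ordered by gfid and, within a file, by the delegator rank that owns each extent. A usage sketch (illustrative only):

    /* hypothetical helper: order a result list before processing it */
    static void sort_results(unifyfs_keyval_t* kvs, size_t n)
    {
        qsort(kvs, n, sizeof(unifyfs_keyval_t), unifyfs_keyval_compare);
    }
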
*/ + range_sz = 0; + rc = configurator_int_val(cfg->meta_range_size, &range_sz); + if (rc != 0) { + return -1; + } + meta_slice_sz = (size_t) range_sz; + mdhim_options_set_max_recs_per_slice(db_opts, (uint64_t)range_sz); + + md = mdhimInit(&comm, db_opts); + + /* index for storing file extent metadata */ + unifyfs_indexes[IDX_FILE_EXTENTS] = md->primary_index; + + /* index for storing file attribute metadata */ + unifyfs_indexes[IDX_FILE_ATTR] = create_global_index(md, + ratio, 1, LEVELDB, MDHIM_INT_KEY, "file_attr"); + + return 0; +} + +void print_fsync_indices(unifyfs_key_t** keys, + unifyfs_val_t** vals, + size_t num_entries) +{ + size_t i; + for (i = 0; i < num_entries; i++) { + LOGDBG("gfid:%d, offset:%lu, addr:%lu, len:%lu, del_id:%d", + keys[i]->gfid, keys[i]->offset, + vals[i]->addr, vals[i]->len, + vals[i]->delegator_rank); + } +} + +static int remove_cb(const char* fpath, const struct stat* sb, + int typeflag, struct FTW* ftwbuf) +{ + int rc = remove(fpath); + if (rc) { + LOGERR("failed to remove(%s)", fpath); + } + return rc; +} + +static int remove_mdhim_db_filetree(char* db_root_path) +{ + LOGDBG("remove MDHIM DB filetree at %s", db_root_path); + return nftw(db_root_path, remove_cb, 64, FTW_DEPTH | FTW_PHYS); +} + + +int meta_sanitize(void) +{ + int rc; + char db_path[UNIFYFS_MAX_FILENAME] = {0}; + + // capture db_path before closing MDHIM + snprintf(db_path, sizeof(db_path), "%s", md->db_opts->db_path); + + mdhimClose(md); + md = NULL; + + // remove the metadata filetree + rc = remove_mdhim_db_filetree(db_path); + if (rc) { + LOGERR("failure during MDHIM file tree removal"); + } + + return UNIFYFS_SUCCESS; +} + +// New API +/* + * + */ +int unifyfs_set_file_attribute( + int set_size, + int set_laminate, + unifyfs_file_attr_t* fattr_ptr) +{ + int rc = UNIFYFS_SUCCESS; + + /* select index for file attributes */ + md->primary_index = unifyfs_indexes[IDX_FILE_ATTR]; + + int gfid = fattr_ptr->gfid; + + /* if we want to preserve some settings, + * we copy those fields from attributes + * on the existing entry, if there is one */ + int preserve = (!set_size || !set_laminate); + if (preserve) { + /* lookup existing attributes for the file */ + unifyfs_file_attr_t attr; + int get_rc = unifyfs_get_file_attribute(gfid, &attr); + if (get_rc == UNIFYFS_SUCCESS) { + /* found the attributes for this file, + * if size flag is not set, preserve existing size value */ + if (!set_size) { + fattr_ptr->size = attr.size; + } + + /* if laminate flag is not set, + * preserve existing is_laminated state */ + if (!set_laminate) { + fattr_ptr->is_laminated = attr.is_laminated; + } + } else { + /* otherwise, trying to update attributes for a file that + * we can't find */ + return get_rc; + } + } + + /* insert file attribute for given global file id */ + struct mdhim_brm_t* brm = mdhimPut(md, + &gfid, sizeof(int), + fattr_ptr, sizeof(unifyfs_file_attr_t), + NULL, NULL); + + if (!brm || brm->error) { + rc = (int)UNIFYFS_ERROR_MDHIM; + } + + if (brm) { + mdhim_full_release_msg(brm); + } + + if (rc != UNIFYFS_SUCCESS) { + LOGERR("failed to insert attributes for gfid=%d", gfid); + } + return rc; +} + +/* + * + */ +int unifyfs_set_file_attributes(int num_entries, + fattr_key_t** keys, int* key_lens, + unifyfs_file_attr_t** fattr_ptr, int* val_lens) +{ + int rc = UNIFYFS_SUCCESS; + + /* select index for file attributes */ + md->primary_index = unifyfs_indexes[IDX_FILE_ATTR]; + + /* put list of key/value pairs */ + struct mdhim_brm_t* brm = mdhimBPut(md, + (void**)keys, key_lens, + (void**)fattr_ptr, val_lens, 
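
UNIFYFS_META_RANGE_SIZE above becomes meta_slice_sz, the slice width used to stripe extent keys across metadata servers, and the companion header declares meta_num_slices() for counting how many slices a byte range touches. Its definition is not in this hunk; a plausible sketch, assuming meta_slice_sz holds the configured slice size:

    /* sketch only -- the real definition is outside this hunk;
     * assumes length > 0 */
    size_t meta_num_slices(size_t offset, size_t length)
    {
        size_t start_slice = offset / meta_slice_sz;
        size_t end_slice   = (offset + length - 1) / meta_slice_sz;
        return (end_slice - start_slice) + 1;
    }
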
+ num_entries, NULL, NULL); + + /* check for errors and free resources */ + if (!brm) { + rc = (int)UNIFYFS_ERROR_MDHIM; + } else { + /* step through linked list of messages, + * scan for any error and free messages */ + struct mdhim_brm_t* brmp = brm; + while (brmp) { + /* check current item for error */ + if (brmp->error) { + LOGERR("MDHIM bulk put error=%d", brmp->error); + rc = (int)UNIFYFS_ERROR_MDHIM; + } + + /* record pointer to current item, + * advance loop pointer to next item in list, + * free resources for current item */ + brm = brmp; + brmp = brmp->next; + mdhim_full_release_msg(brm); + } + } + + if (rc != UNIFYFS_SUCCESS) { + LOGERR("failed to bulk insert file attributes"); + } + return rc; +} + +/* given a global file id, lookup and return file attributes */ +int unifyfs_get_file_attribute( + int gfid, + unifyfs_file_attr_t* attr) +{ + int rc = UNIFYFS_SUCCESS; + + /* select index holding file attributes, + * execute lookup for given file id */ + md->primary_index = unifyfs_indexes[IDX_FILE_ATTR]; + struct mdhim_bgetrm_t* bgrm = mdhimGet(md, md->primary_index, + &gfid, sizeof(int), MDHIM_GET_EQ); + + if (!bgrm || bgrm->error) { + /* failed to find info for this file id */ + rc = (int)UNIFYFS_ERROR_MDHIM; + } else { + /* copy file attribute from value into output parameter */ + unifyfs_file_attr_t* ptr = (unifyfs_file_attr_t*)bgrm->values[0]; + memcpy(attr, ptr, sizeof(unifyfs_file_attr_t)); + } + + /* free resources returned from lookup */ + if (bgrm) { + mdhim_full_release_msg(bgrm); + } + + if (rc != UNIFYFS_SUCCESS) { + LOGERR("failed to retrieve attributes for gfid=%d", gfid); + } + return rc; +} + +/* given a global file id, delete file attributes */ +int unifyfs_delete_file_attribute( + int gfid) +{ + int rc = UNIFYFS_SUCCESS; + + /* select index holding file attributes, + * delete entry for given file id */ + md->primary_index = unifyfs_indexes[IDX_FILE_ATTR]; + struct mdhim_brm_t* brm = mdhimDelete(md, md->primary_index, + &gfid, sizeof(int)); + + /* check for errors and free resources */ + if (!brm) { + rc = (int)UNIFYFS_ERROR_MDHIM; + } else { + /* step through linked list of messages, + * scan for any error and free messages */ + struct mdhim_brm_t* brmp = brm; + while (brmp) { + /* check current item for error */ + if (brmp->error) { + LOGERR("MDHIM delete error=%d", brmp->error); + rc = (int)UNIFYFS_ERROR_MDHIM; + } + + /* record pointer to current item, + * advance loop pointer to next item in list, + * free resources for current item */ + brm = brmp; + brmp = brmp->next; + mdhim_full_release_msg(brm); + } + } + + if (rc != UNIFYFS_SUCCESS) { + LOGERR("failed to delete attributes for gfid=%d", gfid); + } + return rc; +} + +/* + * + */ +int unifyfs_get_file_extents(int num_keys, unifyfs_key_t** keys, + int* key_lens, int* num_values, + unifyfs_keyval_t** keyvals) +{ + /* + * This is using a modified version of mdhim. The function will return all + * key-value pairs within the range of the key tuple. + * We need to re-evaluate this function to use different key-value stores. 
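
The comment above says this lookup returns every key/value pair within the range of the key tuple; the sketch below assumes that tuple is a start/end pair of (gfid, offset) keys, which is an assumption -- the real key construction happens in the request manager, outside this hunk:

    /* hypothetical caller: fetch extent records covering [off, off+len) */
    unifyfs_key_t k_start = { .gfid = gfid, .offset = off };
    unifyfs_key_t k_end   = { .gfid = gfid, .offset = off + len - 1 };
    unifyfs_key_t* keys[2] = { &k_start, &k_end };
    int key_lens[2] = { (int) sizeof(unifyfs_key_t),
                        (int) sizeof(unifyfs_key_t) };

    int num_vals = 0;
    unifyfs_keyval_t* kvs = NULL;
    int rc = unifyfs_get_file_extents(2, keys, key_lens, &num_vals, &kvs);
    if (rc == UNIFYFS_SUCCESS) {
        LOGDBG("range query returned %d keyvals", num_vals);
        free(kvs);   /* caller owns the returned array */
    }
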
+ */ + int rc = UNIFYFS_SUCCESS; + + /* initialize output values */ + *num_values = 0; + *keyvals = NULL; + + /* select index for file extents */ + md->primary_index = unifyfs_indexes[IDX_FILE_EXTENTS]; + + /* execute range query */ + struct mdhim_bgetrm_t* bkvlist = mdhimBGet(md, md->primary_index, + (void**)keys, key_lens, num_keys, MDHIM_RANGE_BGET); + + /* iterate over each item in list, check for errors + * and sum up total number of key/value pairs we got back */ + size_t tot_num = 0; + struct mdhim_bgetrm_t* ptr = bkvlist; + while (ptr) { + /* check that we don't have an error condition */ + if (ptr->error) { + /* hit an error */ + LOGERR("MDHIM range query error=%d", ptr->error); + return (int)UNIFYFS_ERROR_MDHIM; + } + + /* total up number of key/values returned */ + tot_num += (size_t) ptr->num_keys; + + /* get pointer to next item in the list */ + ptr = ptr->next; + } + + /* allocate memory to copy key/value data */ + unifyfs_keyval_t* kvs = (unifyfs_keyval_t*) calloc( + tot_num, sizeof(unifyfs_keyval_t)); + if (NULL == kvs) { + LOGERR("failed to allocate keyvals"); + return ENOMEM; + } + + /* iterate over list and copy each key/value into output array */ + ptr = bkvlist; + unifyfs_keyval_t* kviter = kvs; + while (ptr) { + /* iterate over key/value in list element */ + int i; + for (i = 0; i < ptr->num_keys; i++) { + /* get pointer to current key and value */ + unifyfs_key_t* tmp_key = (unifyfs_key_t*)ptr->keys[i]; + unifyfs_val_t* tmp_val = (unifyfs_val_t*)ptr->values[i]; + + /* copy contents over to output array */ + memcpy(&(kviter->key), tmp_key, sizeof(unifyfs_key_t)); + memcpy(&(kviter->val), tmp_val, sizeof(unifyfs_val_t)); + + /* bump up to next element in output array */ + kviter++; + } + + /* get pointer to next item in the list */ + struct mdhim_bgetrm_t* next = ptr->next; + + /* release resources for the curren item */ + mdhim_full_release_msg(ptr); + ptr = next; + } + + /* set output values */ + *num_values = tot_num; + *keyvals = kvs; + + return rc; +} + +/* + * + */ +int unifyfs_set_file_extents(int num_entries, + unifyfs_key_t** keys, int* key_lens, + unifyfs_val_t** vals, int* val_lens) +{ + int rc = UNIFYFS_SUCCESS; + + /* select index for file extents */ + md->primary_index = unifyfs_indexes[IDX_FILE_EXTENTS]; + + /* put list of key/value pairs */ + struct mdhim_brm_t* brm = mdhimBPut(md, + (void**)(keys), key_lens, + (void**)(vals), val_lens, + num_entries, NULL, NULL); + + /* check for errors and free resources */ + if (!brm) { + rc = (int)UNIFYFS_ERROR_MDHIM; + } else { + /* step through linked list of messages, + * scan for any error and free messages */ + struct mdhim_brm_t* brmp = brm; + while (brmp) { + /* check current item for error */ + if (brmp->error) { + LOGERR("MDHIM bulk put error=%d", brmp->error); + rc = (int)UNIFYFS_ERROR_MDHIM; + } + + /* record pointer to current item, + * advance loop pointer to next item in list, + * free resources for current item */ + brm = brmp; + brmp = brmp->next; + mdhim_full_release_msg(brm); + } + } + + if (rc != UNIFYFS_SUCCESS) { + LOGERR("failed to bulk insert file extents"); + } + return rc; +} + +/* delete the listed keys from the file extents */ +int unifyfs_delete_file_extents( + int num_entries, /* number of items in keys list */ + unifyfs_key_t** keys, /* list of keys to be deleted */ + int* key_lens) /* list of byte sizes for keys list items */ +{ + /* assume we'll succeed */ + int rc = UNIFYFS_SUCCESS; + + /* select index for file extents */ + md->primary_index = unifyfs_indexes[IDX_FILE_EXTENTS]; + + 
/* delete list of key/value pairs */ + struct mdhim_brm_t* brm = mdhimBDelete(md, md->primary_index, + (void**)(keys), key_lens, num_entries); + + /* check for errors and free resources */ + if (!brm) { + rc = (int)UNIFYFS_ERROR_MDHIM; + } else { + /* step through linked list of messages, + * scan for any error and free messages */ + struct mdhim_brm_t* brmp = brm; + while (brmp) { + /* check current item for error */ + if (brmp->error) { + LOGERR("MDHIM bulk delete error=%d", brmp->error); + rc = (int)UNIFYFS_ERROR_MDHIM; + } + + /* record pointer to current item, + * advance loop pointer to next item in list, + * free resources for current item */ + brm = brmp; + brmp = brmp->next; + mdhim_full_release_msg(brm); + } + } + + if (rc != UNIFYFS_SUCCESS) { + LOGERR("failed to bulk delete file extents"); + } + return rc; +} diff --git a/server/src/unifyfs_metadata.h b/server/src/unifyfs_metadata_mdhim.h similarity index 60% rename from server/src/unifyfs_metadata.h rename to server/src/unifyfs_metadata_mdhim.h index 200c78742..94ef193c7 100644 --- a/server/src/unifyfs_metadata.h +++ b/server/src/unifyfs_metadata_mdhim.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017-2019, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -27,26 +27,29 @@ * Please read https://github.com/llnl/burstfs/LICENSE for full license text. */ -#ifndef UNIFYFS_METADATA_H -#define UNIFYFS_METADATA_H +#ifndef UNIFYFS_METADATA_MDHIM_H +#define UNIFYFS_METADATA_MDHIM_H #include "unifyfs_configurator.h" -#include "unifyfs_global.h" +#include "unifyfs_log.h" +#include "unifyfs_meta.h" + -#define MANIFEST_FILE_NAME "mdhim_manifest_" +/* Key for file attributes */ +typedef int fattr_key_t; /** * Key for a file extent */ typedef struct { /** global file id */ - int fid; + int gfid; /** logical file offset */ size_t offset; } unifyfs_key_t; #define UNIFYFS_KEY_SZ (sizeof(unifyfs_key_t)) -#define UNIFYFS_KEY_FID(keyp) (((unifyfs_key_t*)keyp)->fid) +#define UNIFYFS_KEY_FID(keyp) (((unifyfs_key_t*)keyp)->gfid) #define UNIFYFS_KEY_OFF(keyp) (((unifyfs_key_t*)keyp)->offset) typedef struct { @@ -72,18 +75,14 @@ typedef struct { } unifyfs_keyval_t; int unifyfs_key_compare(unifyfs_key_t* a, unifyfs_key_t* b); +int unifyfs_keyval_compare(const void* a, const void* b); -void debug_log_key_val(const char* ctx, - unifyfs_key_t* key, - unifyfs_val_t* val); +/* return number of slice ranges needed to cover range */ +size_t meta_num_slices(size_t offset, size_t length); int meta_sanitize(void); int meta_init_store(unifyfs_cfg_t* cfg); -void print_bget_indices(int app_id, int client_id, - send_msg_t* index_set, int tot_num); -int meta_init_indices(void); -void meta_free_indices(void); void print_fsync_indices(unifyfs_key_t** unifyfs_keys, unifyfs_val_t** unifyfs_vals, size_t num_entries); @@ -97,13 +96,26 @@ void print_fsync_indices(unifyfs_key_t** unifyfs_keys, int unifyfs_get_file_attribute(int gfid, unifyfs_file_attr_t* ptr_attr_val); +/** + * Delete file attribute from the KV-Store. + * + * @param [in] gfid + * @return UNIFYFS_SUCCESS on success + */ +int unifyfs_delete_file_attribute(int gfid); + /** * Store a File attribute to the KV-Store. 
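 *
 * Usage sketch (hypothetical caller, for illustration only): to record a
 * new size while preserving the existing lamination state, pass
 * size_flag=1 and laminate_flag=0:
 *
 *   unifyfs_file_attr_t fattr = current_attrs;  // hypothetical existing attrs
 *   fattr.size = new_size;
 *   rc = unifyfs_set_file_attribute(1, 0, &fattr);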
* + * @param[in] size_flag + * @param[in] laminate_flag * @param[in] *ptr_attr_val * @return UNIFYFS_SUCCESS on success */ -int unifyfs_set_file_attribute(unifyfs_file_attr_t* ptr_attr_val); +int unifyfs_set_file_attribute( + int size_flag, + int laminate_flag, + unifyfs_file_attr_t* ptr_attr_val); /** * Store File attributes to the KV-Store. @@ -132,6 +144,16 @@ int unifyfs_get_file_extents(int num_keys, unifyfs_key_t** keys, int* key_lens, int* num_values, unifyfs_keyval_t** keyval); +/** + * Delete File extents from the KV-Store. + * + * @param[in] num_entries number of key value pairs to delete + * @param[in] keys array storing the keys + * @param[in] key_lens array with the length of the elements in \p keys + */ +int unifyfs_delete_file_extents(int num_entries, + unifyfs_key_t** keys, int* key_lens); + /** * Store File extents in the KV-Store. * @@ -146,4 +168,70 @@ int unifyfs_set_file_extents(int num_entries, unifyfs_key_t** keys, int* key_lens, unifyfs_val_t** vals, int* val_lens); -#endif + + +static inline +unifyfs_key_t** alloc_key_array(int elems) +{ + int size = elems * (sizeof(unifyfs_key_t*) + sizeof(unifyfs_key_t)); + + void* mem_block = calloc(size, sizeof(char)); + + unifyfs_key_t** array_ptr = mem_block; + unifyfs_key_t* key_ptr = (unifyfs_key_t*)(array_ptr + elems); + + for (int i = 0; i < elems; i++) { + array_ptr[i] = &key_ptr[i]; + } + + return (unifyfs_key_t**)mem_block; +} + +static inline +unifyfs_val_t** alloc_value_array(int elems) +{ + int size = elems * (sizeof(unifyfs_val_t*) + sizeof(unifyfs_val_t)); + + void* mem_block = calloc(size, sizeof(char)); + + unifyfs_val_t** array_ptr = mem_block; + unifyfs_val_t* key_ptr = (unifyfs_val_t*)(array_ptr + elems); + + for (int i = 0; i < elems; i++) { + array_ptr[i] = &key_ptr[i]; + } + + return (unifyfs_val_t**)mem_block; +} + +static inline +void free_key_array(unifyfs_key_t** array) +{ + free(array); +} + +static inline +void free_value_array(unifyfs_val_t** array) +{ + free(array); +} + +static inline +void debug_log_key_val(const char* ctx, + unifyfs_key_t* key, + unifyfs_val_t* val) +{ + if ((key != NULL) && (val != NULL)) { + LOGDBG("@%s - key(gfid=%d, offset=%lu), " + "val(del=%d, len=%lu, addr=%lu, app=%d, rank=%d)", + ctx, key->gfid, key->offset, + val->delegator_rank, val->len, val->addr, + val->app_id, val->rank); + } else if (key != NULL) { + LOGDBG("@%s - key(gfid=%d, offset=%lu)", + ctx, key->gfid, key->offset); + } +} + + +#endif // UNIFYFS_METADATA_MDHIM_H diff --git a/server/src/unifyfs_p2p_rpc.c b/server/src/unifyfs_p2p_rpc.c new file mode 100644 index 000000000..c41e379ca --- /dev/null +++ b/server/src/unifyfs_p2p_rpc.c @@ -0,0 +1,990 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
+ */ + +#include "unifyfs_global.h" +#include "margo_server.h" +#include "unifyfs_server_rpcs.h" +#include "unifyfs_p2p_rpc.h" +#include "unifyfs_group_rpc.h" + +/************************************************************************* + * Peer-to-peer RPC helper methods + *************************************************************************/ + +/* determine server responsible for maintaining target file's metadata */ +int hash_gfid_to_server(int gfid) +{ + return gfid % glb_pmi_size; +} + +/* server peer-to-peer (p2p) margo request structure */ +typedef struct { + margo_request request; + hg_addr_t peer; + hg_handle_t handle; +} p2p_request; + +/* helper method to initialize peer request rpc handle */ +static +int get_request_handle(hg_id_t request_hgid, + int peer_rank, + p2p_request* req) +{ + int rc = UNIFYFS_SUCCESS; + + /* get address for specified server rank */ + req->peer = glb_servers[peer_rank].margo_svr_addr; + + /* get handle to rpc function */ + hg_return_t hret = margo_create(unifyfsd_rpc_context->svr_mid, req->peer, + request_hgid, &(req->handle)); + if (hret != HG_SUCCESS) { + LOGERR("failed to get handle for p2p request(%p) to server %d", + req, peer_rank); + rc = UNIFYFS_ERROR_MARGO; + } + + return rc; +} + +/* helper method to forward peer rpc request */ +static +int forward_request(void* input_ptr, + p2p_request* req) +{ + int rc = UNIFYFS_SUCCESS; + + /* call rpc function */ + hg_return_t hret = margo_iforward(req->handle, input_ptr, + &(req->request)); + if (hret != HG_SUCCESS) { + LOGERR("failed to forward p2p request(%p)", req); + rc = UNIFYFS_ERROR_MARGO; + } + + return rc; +} + +/* helper method to wait for peer rpc request completion */ +static +int wait_for_request(p2p_request* req) +{ + int rc = UNIFYFS_SUCCESS; + + /* call rpc function */ + hg_return_t hret = margo_wait(req->request); + if (hret != HG_SUCCESS) { + LOGERR("wait on p2p request(%p) failed", req); + rc = UNIFYFS_ERROR_MARGO; + } + + return rc; +} + +/************************************************************************* + * File extents metadata update request + *************************************************************************/ + +/* Add extents rpc handler */ +static void add_extents_rpc(hg_handle_t handle) +{ + LOGDBG("add_extents rpc handler"); + + /* assume we'll succeed */ + int32_t ret = UNIFYFS_SUCCESS; + + const struct hg_info* hgi = margo_get_info(handle); + assert(hgi); + margo_instance_id mid = margo_hg_info_get_instance(hgi); + assert(mid != MARGO_INSTANCE_NULL); + + /* get input params */ + add_extents_in_t in; + hg_return_t hret = margo_get_input(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + int sender = in.src_rank; + int gfid = in.gfid; + size_t num_extents = (size_t) in.num_extents; + size_t bulk_sz = num_extents * sizeof(struct extent_tree_node); + + /* allocate memory for extents */ + void* extents_buf = malloc(bulk_sz); + if (NULL == extents_buf) { + LOGERR("allocation for bulk extents failed"); + ret = ENOMEM; + } else { + /* register local target buffer for bulk access */ + hg_bulk_t bulk_handle; + hret = margo_bulk_create(mid, 1, &extents_buf, &bulk_sz, + HG_BULK_WRITE_ONLY, &bulk_handle); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_create() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* get list of read requests */ + hret = margo_bulk_transfer(mid, HG_BULK_PULL, + hgi->addr, in.extents, 0, + bulk_handle, 0, + bulk_sz); + if (hret != HG_SUCCESS) { + 
LOGERR("margo_bulk_transfer() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* store new extents */ + LOGDBG("received %zu extents for gfid=%d from %d", + num_extents, gfid, sender); + struct extent_tree_node* extents = extents_buf; + ret = unifyfs_inode_add_extents(gfid, num_extents, extents); + if (ret) { + LOGERR("failed to add extents from %d (ret=%d)", + sender, ret); + } + } + margo_bulk_free(bulk_handle); + } + free(extents_buf); + } + margo_free_input(handle, &in); + } + + /* build our output values */ + add_extents_out_t out; + out.ret = ret; + + /* send output back to caller */ + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* free margo resources */ + margo_destroy(handle); +} +DEFINE_MARGO_RPC_HANDLER(add_extents_rpc) + +/* Add extents to target file */ +int unifyfs_invoke_add_extents_rpc(int gfid, + unsigned int num_extents, + struct extent_tree_node* extents) +{ + int owner_rank = hash_gfid_to_server(gfid); + if (owner_rank == glb_pmi_rank) { + /* I'm the owner, already did local add */ + return UNIFYFS_SUCCESS; + } + + /* forward request to file owner */ + p2p_request preq; + hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.extent_add_id; + int rc = get_request_handle(req_hgid, owner_rank, &preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* create a margo bulk transfer handle for extents array */ + hg_bulk_t bulk_handle; + void* buf = (void*) extents; + size_t buf_sz = (size_t)num_extents * sizeof(struct extent_tree_node); + hg_return_t hret = margo_bulk_create(unifyfsd_rpc_context->svr_mid, + 1, &buf, &buf_sz, + HG_BULK_READ_ONLY, &bulk_handle); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_create() failed"); + return UNIFYFS_ERROR_MARGO; + } + + /* fill rpc input struct and forward request */ + add_extents_in_t in; + in.src_rank = (int32_t) glb_pmi_rank; + in.gfid = (int32_t) gfid; + in.num_extents = (int32_t) num_extents; + in.extents = bulk_handle; + rc = forward_request((void*)&in, &preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + margo_bulk_free(bulk_handle); + + /* wait for request completion */ + rc = wait_for_request(&preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* get the output of the rpc */ + int ret; + add_extents_out_t out; + hret = margo_get_output(preq.handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* set return value */ + ret = out.ret; + margo_free_output(preq.handle, &out); + } + margo_destroy(preq.handle); + + return ret; +} + +/************************************************************************* + * File extents metadata lookup request + *************************************************************************/ + +/* find extents rpc handler */ +static void find_extents_rpc(hg_handle_t handle) +{ + LOGDBG("find_extents rpc handler"); + + int32_t ret; + unsigned int num_chunks = 0; + chunk_read_req_t* chunk_locs = NULL; + + const struct hg_info* hgi = margo_get_info(handle); + assert(hgi); + + margo_instance_id mid = margo_hg_info_get_instance(hgi); + assert(mid != MARGO_INSTANCE_NULL); + + /* get input params */ + find_extents_in_t in; + hg_return_t hret = margo_get_input(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + int sender = in.src_rank; + int gfid = in.gfid; + size_t num_extents = (size_t) in.num_extents; + size_t bulk_sz = num_extents * sizeof(unifyfs_inode_extent_t); + + /* allocate memory 
for extents */ + void* extents_buf = malloc(bulk_sz); + if (NULL == extents_buf) { + LOGERR("allocation for bulk extents failed"); + ret = ENOMEM; + } else { + /* register local target buffer for bulk access */ + hg_bulk_t bulk_req_handle; + hret = margo_bulk_create(mid, 1, &extents_buf, &bulk_sz, + HG_BULK_WRITE_ONLY, &bulk_req_handle); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_create() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* get list of read requests */ + hret = margo_bulk_transfer(mid, HG_BULK_PULL, + hgi->addr, in.extents, 0, + bulk_req_handle, 0, + bulk_sz); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_transfer() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* lookup requested extents */ + LOGDBG("received %zu extent lookups for gfid=%d from %d", + num_extents, gfid, sender); + unifyfs_inode_extent_t* extents = extents_buf; + ret = unifyfs_inode_resolve_extent_chunks(num_extents, + extents, + &num_chunks, + &chunk_locs); + if (ret) { + LOGERR("failed to find extents for %d (ret=%d)", + sender, ret); + } + } + margo_bulk_free(bulk_req_handle); + } + free(extents_buf); + } + margo_free_input(handle, &in); + } + + /* define a bulk handle to transfer chunk address info */ + hg_bulk_t bulk_resp_handle = HG_BULK_NULL; + if (ret == UNIFYFS_SUCCESS) { + if (num_chunks > 0) { + void* buf = (void*) chunk_locs; + size_t buf_sz = (size_t)num_chunks * sizeof(chunk_read_req_t); + hret = margo_bulk_create(mid, 1, &buf, &buf_sz, + HG_BULK_READ_ONLY, &bulk_resp_handle); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_create() failed"); + ret = UNIFYFS_ERROR_MARGO; + } + } + } + + /* fill rpc response struct with output values */ + find_extents_out_t out; + out.ret = ret; + out.num_locations = num_chunks; + out.locations = bulk_resp_handle; + + /* send output back to caller */ + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* free margo resources */ + if (bulk_resp_handle != HG_BULK_NULL) { + margo_bulk_free(bulk_resp_handle); + } + margo_destroy(handle); +} +DEFINE_MARGO_RPC_HANDLER(find_extents_rpc) + +/* Lookup extent locations for target file */ +int unifyfs_invoke_find_extents_rpc(int gfid, + unsigned int num_extents, + unifyfs_inode_extent_t* extents, + unsigned int* num_chunks, + chunk_read_req_t** chunks) +{ + if ((NULL == num_chunks) || (NULL == chunks)) { + return EINVAL; + } + *num_chunks = 0; + *chunks = NULL; + + int owner_rank = hash_gfid_to_server(gfid); + + /* do local inode metadata lookup to check for laminated */ + unifyfs_file_attr_t attrs; + int ret = unifyfs_inode_metaget(gfid, &attrs); + if (ret == UNIFYFS_SUCCESS) { + if (attrs.is_laminated || (owner_rank == glb_pmi_rank)) { + /* do local lookup */ + ret = unifyfs_inode_resolve_extent_chunks((size_t)num_extents, + extents, + num_chunks, chunks); + if (ret) { + LOGERR("failed to find extents for gfid=%d (ret=%d)", + gfid, ret); + } + return ret; + } + } + + /* forward request to file owner */ + p2p_request preq; + margo_instance_id mid = unifyfsd_rpc_context->svr_mid; + hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.extent_lookup_id; + int rc = get_request_handle(req_hgid, owner_rank, &preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* create a margo bulk transfer handle for extents array */ + hg_bulk_t bulk_req_handle; + void* buf = (void*) extents; + size_t buf_sz = (size_t)num_extents * sizeof(unifyfs_inode_extent_t); + hg_return_t hret = margo_bulk_create(mid, 1, &buf, &buf_sz, + HG_BULK_READ_ONLY, &bulk_req_handle); + 
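    /* Note: this buffer is registered HG_BULK_READ_ONLY because the remote
     * find_extents_rpc handler pulls from it (HG_BULK_PULL) into a buffer
     * it registers as HG_BULK_WRITE_ONLY; no data is written back through
     * this handle. */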
if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_create() failed"); + return UNIFYFS_ERROR_MARGO; + } + + /* fill rpc input struct and forward request */ + find_extents_in_t in; + in.src_rank = (int32_t) glb_pmi_rank; + in.gfid = (int32_t) gfid; + in.num_extents = (int32_t) num_extents; + in.extents = bulk_req_handle; + rc = forward_request((void*)&in, &preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + margo_bulk_free(bulk_req_handle); + + /* wait for request completion */ + rc = wait_for_request(&preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* get the output of the rpc */ + find_extents_out_t out; + hret = margo_get_output(preq.handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* set return value */ + ret = out.ret; + if (ret == UNIFYFS_SUCCESS) { + /* get number of chunks */ + unsigned int n_chks = (unsigned int) out.num_locations; + if (n_chks > 0) { + /* got some chunks to read, allocate a buffer + * to hold chunk location data */ + buf_sz = (size_t)n_chks * sizeof(chunk_read_req_t); + buf = malloc(buf_sz); + if (NULL == buf) { + LOGERR("allocation for bulk locations failed"); + ret = ENOMEM; + } else { + /* create a margo bulk transfer handle for + * locations array */ + hg_bulk_t bulk_resp_handle; + hret = margo_bulk_create(mid, 1, &buf, &buf_sz, + HG_BULK_WRITE_ONLY, + &bulk_resp_handle); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_create() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* pull locations array */ + hret = margo_bulk_transfer(mid, HG_BULK_PULL, + preq.peer, out.locations, 0, + bulk_resp_handle, 0, + buf_sz); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_transfer() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* lookup requested extents */ + LOGDBG("received %u chunk locations for gfid=%d", + n_chks, gfid); + *chunks = (chunk_read_req_t*) buf; + *num_chunks = (unsigned int) n_chks; + } + margo_bulk_free(bulk_resp_handle); + } + } + } + } + margo_free_output(preq.handle, &out); + } + margo_destroy(preq.handle); + + return ret; +} + +/************************************************************************* + * File attributes request + *************************************************************************/ + +/* Metaget rpc handler */ +static void metaget_rpc(hg_handle_t handle) +{ + LOGDBG("metaget rpc handler"); + + int32_t ret; + + /* initialize invalid attributes */ + unifyfs_file_attr_t attrs; + unifyfs_file_attr_set_invalid(&attrs); + + /* get input params */ + metaget_in_t in; + hg_return_t hret = margo_get_input(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + ret = unifyfs_inode_metaget(in.gfid, &attrs); + margo_free_input(handle, &in); + } + + /* fill output values */ + metaget_out_t out; + out.ret = ret; + out.attr = attrs; + + /* send output back to caller */ + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* free margo resources */ + margo_destroy(handle); +} +DEFINE_MARGO_RPC_HANDLER(metaget_rpc) + +/* Get file attributes for target file */ +int unifyfs_invoke_metaget_rpc(int gfid, + unifyfs_file_attr_t* attrs) +{ + if (NULL == attrs) { + return EINVAL; + } + + int owner_rank = hash_gfid_to_server(gfid); + + /* do local inode metadata lookup to check for laminated */ + int rc = unifyfs_inode_metaget(gfid, attrs); + if ((rc == UNIFYFS_SUCCESS) && (attrs->is_laminated)) { + /* if laminated, we already have 
final metadata locally */ + return UNIFYFS_SUCCESS; + } + if (owner_rank == glb_pmi_rank) { + return rc; + } + + int need_local_metadata = 0; + if (rc == ENOENT) { + /* inode_metaget above failed with ENOENT, need to create inode */ + need_local_metadata = 1; + } + + /* forward request to file owner */ + p2p_request preq; + hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.metaget_id; + rc = get_request_handle(req_hgid, owner_rank, &preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* fill rpc input struct and forward request */ + metaget_in_t in; + in.gfid = (int32_t)gfid; + rc = forward_request((void*)&in, &preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* wait for request completion */ + rc = wait_for_request(&preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* get the output of the rpc */ + int ret; + metaget_out_t out; + hg_return_t hret = margo_get_output(preq.handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* set return value */ + ret = out.ret; + if (ret == UNIFYFS_SUCCESS) { + *attrs = out.attr; + if (out.attr.filename != NULL) { + attrs->filename = strdup(out.attr.filename); + } + if (need_local_metadata) { + unifyfs_inode_metaset(gfid, UNIFYFS_FILE_ATTR_OP_CREATE, + attrs); + } + } + margo_free_output(preq.handle, &out); + } + margo_destroy(preq.handle); + + return ret; +} + +/************************************************************************* + * File size request + *************************************************************************/ + +/* Filesize rpc handler */ +static void filesize_rpc(hg_handle_t handle) +{ + LOGDBG("filesize rpc handler"); + + int32_t ret; + hg_size_t filesize = 0; + + /* get input params */ + filesize_in_t in; + hg_return_t hret = margo_get_input(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + ret = unifyfs_inode_get_filesize(in.gfid, &filesize); + margo_free_input(handle, &in); + } + + /* build our output values */ + filesize_out_t out; + out.ret = ret; + out.filesize = filesize; + + /* send output back to caller */ + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* free margo resources */ + margo_destroy(handle); +} +DEFINE_MARGO_RPC_HANDLER(filesize_rpc) + +/* Get current global size for the target file */ +int unifyfs_invoke_filesize_rpc(int gfid, + size_t* filesize) +{ + if (NULL == filesize) { + return EINVAL; + } + + int owner_rank = hash_gfid_to_server(gfid); + + /* do local inode metadata lookup to check for laminated */ + unifyfs_file_attr_t attrs; + int rc = unifyfs_inode_metaget(gfid, &attrs); + if ((rc == UNIFYFS_SUCCESS) && (attrs.is_laminated)) { + /* if laminated, we already have final metadata stored locally */ + *filesize = (size_t) attrs.size; + return UNIFYFS_SUCCESS; + } + if (owner_rank == glb_pmi_rank) { + *filesize = (size_t) attrs.size; + return rc; + } + + /* forward request to file owner */ + p2p_request preq; + hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.filesize_id; + rc = get_request_handle(req_hgid, owner_rank, &preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* fill rpc input struct and forward request */ + filesize_in_t in; + in.gfid = (int32_t)gfid; + rc = forward_request((void*)&in, &preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* wait for request completion */ + rc = wait_for_request(&preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* 
get the output of the rpc */ + int ret; + filesize_out_t out; + hg_return_t hret = margo_get_output(preq.handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* set return value */ + ret = out.ret; + if (ret == UNIFYFS_SUCCESS) { + *filesize = (size_t) out.filesize; + } + margo_free_output(preq.handle, &out); + } + margo_destroy(preq.handle); + + return ret; +} + +/************************************************************************* + * File attributes update request + *************************************************************************/ + +/* Metaset rpc handler */ +static void metaset_rpc(hg_handle_t handle) +{ + LOGDBG("metaset rpc handler"); + + int32_t ret; + + /* get input params */ + metaset_in_t in; + hg_return_t hret = margo_get_input(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + unifyfs_file_attr_op_e attr_op = in.fileop; + ret = unifyfs_inode_metaset(in.gfid, attr_op, &(in.attr)); + margo_free_input(handle, &in); + } + + /* build our output values */ + metaset_out_t out; + out.ret = ret; + + /* send output back to caller */ + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* free margo resources */ + margo_destroy(handle); +} +DEFINE_MARGO_RPC_HANDLER(metaset_rpc) + +/* Set metadata for target file */ +int unifyfs_invoke_metaset_rpc(int gfid, + int attr_op, + unifyfs_file_attr_t* attrs) +{ + if (NULL == attrs) { + return EINVAL; + } + + int owner_rank = hash_gfid_to_server(gfid); + if (owner_rank == glb_pmi_rank) { + /* I'm the owner, do local inode metadata update */ + return unifyfs_inode_metaset(gfid, attr_op, attrs); + } + + /* forward request to file owner */ + p2p_request preq; + hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.metaset_id; + int rc = get_request_handle(req_hgid, owner_rank, &preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* fill rpc input struct and forward request */ + metaset_in_t in; + in.gfid = (int32_t) gfid; + in.fileop = (int32_t) attr_op; + in.attr = *attrs; + rc = forward_request((void*)&in, &preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* wait for request completion */ + rc = wait_for_request(&preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* get the output of the rpc */ + int ret; + metaset_out_t out; + hg_return_t hret = margo_get_output(preq.handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* set return value */ + ret = out.ret; + margo_free_output(preq.handle, &out); + + /* if update at owner succeeded, do it locally */ + if (ret == UNIFYFS_SUCCESS) { + ret = unifyfs_inode_metaset(gfid, attr_op, attrs); + } + } + margo_destroy(preq.handle); + + return ret; +} + +/************************************************************************* + * File lamination request + *************************************************************************/ + +/* Laminate rpc handler */ +static void laminate_rpc(hg_handle_t handle) +{ + LOGDBG("laminate rpc handler"); + + int32_t ret; + + /* get input params */ + laminate_in_t in; + hg_return_t hret = margo_get_input(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + int gfid = (int) in.gfid; + margo_free_input(handle, &in); + + ret = unifyfs_inode_laminate(gfid); + if (ret == UNIFYFS_SUCCESS) { + /* tell the rest of 
the servers */ + ret = unifyfs_invoke_broadcast_laminate(gfid); + } + } + + /* build our output values */ + laminate_out_t out; + out.ret = ret; + + /* send output back to caller */ + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* free margo resources */ + margo_destroy(handle); +} +DEFINE_MARGO_RPC_HANDLER(laminate_rpc) + +/* Laminate the target file */ +int unifyfs_invoke_laminate_rpc(int gfid) +{ + int ret; + int owner_rank = hash_gfid_to_server(gfid); + if (owner_rank == glb_pmi_rank) { + /* I'm the owner, do local inode metadata update */ + ret = unifyfs_inode_laminate(gfid); + if (ret == UNIFYFS_SUCCESS) { + /* tell the rest of the servers */ + ret = unifyfs_invoke_broadcast_laminate(gfid); + } + return ret; + } + + /* forward request to file owner */ + p2p_request preq; + hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.laminate_id; + int rc = get_request_handle(req_hgid, owner_rank, &preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* fill rpc input struct and forward request */ + laminate_in_t in; + in.gfid = (int32_t)gfid; + rc = forward_request((void*)&in, &preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* wait for request completion */ + rc = wait_for_request(&preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* get the output of the rpc */ + laminate_out_t out; + hg_return_t hret = margo_get_output(preq.handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* set return value */ + ret = out.ret; + margo_free_output(preq.handle, &out); + } + margo_destroy(preq.handle); + + return ret; +} + +/************************************************************************* + * File truncation request + *************************************************************************/ + +/* Truncate rpc handler */ +static void truncate_rpc(hg_handle_t handle) +{ + LOGDBG("truncate rpc handler"); + + int32_t ret; + + /* get input params */ + truncate_in_t in; + hg_return_t hret = margo_get_input(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + int gfid = (int) in.gfid; + size_t fsize = (size_t) in.filesize; + ret = unifyfs_invoke_broadcast_truncate(gfid, fsize); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("truncate(gfid=%d, size=%zu) broadcast failed", + gfid, fsize); + } + margo_free_input(handle, &in); + } + + /* build our output values */ + truncate_out_t out; + out.ret = ret; + + /* send output back to caller */ + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* free margo resources */ + margo_destroy(handle); +} +DEFINE_MARGO_RPC_HANDLER(truncate_rpc) + +/* Truncate the target file */ +int unifyfs_invoke_truncate_rpc(int gfid, + size_t filesize) +{ + int owner_rank = hash_gfid_to_server(gfid); + if (owner_rank == glb_pmi_rank) { + /* I'm the owner, start broadcast update. The local inode will be + * updated as part of this update. 
*/ + return unifyfs_invoke_broadcast_truncate(gfid, filesize); + } + + /* forward request to file owner */ + p2p_request preq; + hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.truncate_id; + int rc = get_request_handle(req_hgid, owner_rank, &preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* fill rpc input struct and forward request */ + truncate_in_t in; + in.gfid = (int32_t) gfid; + in.filesize = (hg_size_t) filesize; + rc = forward_request((void*)&in, &preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* wait for request completion */ + rc = wait_for_request(&preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* get the output of the rpc */ + int ret; + truncate_out_t out; + hg_return_t hret = margo_get_output(preq.handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* set return value */ + ret = out.ret; + margo_free_output(preq.handle, &out); + } + margo_destroy(preq.handle); + + return ret; +} diff --git a/server/src/unifyfs_p2p_rpc.h b/server/src/unifyfs_p2p_rpc.h new file mode 100644 index 000000000..9710a8ca4 --- /dev/null +++ b/server/src/unifyfs_p2p_rpc.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#ifndef _UNIFYFS_P2P_RPC_H +#define _UNIFYFS_P2P_RPC_H + +#include "unifyfs_global.h" +#include "extent_tree.h" +#include "unifyfs_inode.h" + + +/* Point-to-point Server RPCs */ + + +/* determine server responsible for maintaining target file's metadata */ +int hash_gfid_to_server(int gfid); + + +/** + * @brief Add new extents to target file + * + * @param gfid target file + * @param num_extents length of file extents array + * @param extents array of extents to add + * + * @return success|failure + */ +int unifyfs_invoke_add_extents_rpc(int gfid, + unsigned int num_extents, + struct extent_tree_node* extents); + +/** + * @brief Find location of extents for target file + * + * @param gfid target file + * @param num_extents length of file extents array + * @param extents array of extents to find + * + * @param[out] num_chunks number of chunk locations + * @param[out] chunks array of chunk locations for requested extents + * + * @return success|failure + */ +int unifyfs_invoke_find_extents_rpc(int gfid, + unsigned int num_extents, + unifyfs_inode_extent_t* extents, + unsigned int* num_chunks, + chunk_read_req_t** chunks); + +/** + * @brief Get file size for the target file + * + * @param gfid target file + * @param filesize pointer to size variable + * + * @return success|failure + */ +int unifyfs_invoke_filesize_rpc(int gfid, + size_t* filesize); + +/** + * @brief Laminate the target file + * + * @param gfid target file + * + * @return success|failure + */ +int unifyfs_invoke_laminate_rpc(int gfid); + +/** + * @brief Get metadata for target file + * + * @param gfid target file + * @param create flag indicating if this is a newly created file + * @param attr file attributes to update + * + * @return success|failure + */ +int unifyfs_invoke_metaget_rpc(int gfid, + unifyfs_file_attr_t* attrs); + +/** + * @brief Update metadata for target file + * + * @param gfid target file + * @param attr_op metadata operation that 
triggered update + * @param attr file attributes to update + * + * @return success|failure + */ +int unifyfs_invoke_metaset_rpc(int gfid, int attr_op, + unifyfs_file_attr_t* attrs); + +/** + * @brief Truncate target file + * + * @param gfid target file + * @param filesize truncated file size + * + * @return success|failure + */ +int unifyfs_invoke_truncate_rpc(int gfid, size_t filesize); + + +#endif // UNIFYFS_P2P_RPC_H diff --git a/server/src/unifyfs_request_manager.c b/server/src/unifyfs_request_manager.c index c3baa1e7e..1ffa869df 100644 --- a/server/src/unifyfs_request_manager.c +++ b/server/src/unifyfs_request_manager.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017-2019, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -27,75 +27,68 @@ * Please read https://github.com/llnl/burstfs/LICENSE for full license text. */ -// system headers -#include -#include -#include -#include -#include -#include - // general support #include "unifyfs_global.h" -#include "unifyfs_log.h" // server components +#include "unifyfs_inode_tree.h" +#include "unifyfs_metadata_mdhim.h" #include "unifyfs_request_manager.h" #include "unifyfs_service_manager.h" -#include "unifyfs_metadata.h" // margo rpcs +#include "unifyfs_group_rpc.h" #include "unifyfs_server_rpcs.h" #include "margo_server.h" -#include "ucr_read_builder.h" +// system headers +#include // nanosleep #define RM_LOCK(rm) \ do { \ - LOGDBG("locking RM[%d] state", rm->thrd_ndx); \ + LOGDBG("locking RM[%d:%d] state", rm->app_id, rm->client_id); \ pthread_mutex_lock(&(rm->thrd_lock)); \ } while (0) #define RM_UNLOCK(rm) \ do { \ - LOGDBG("unlocking RM[%d] state", rm->thrd_ndx); \ + LOGDBG("unlocking RM[%d:%d] state", rm->app_id, rm->client_id); \ pthread_mutex_unlock(&(rm->thrd_lock)); \ } while (0) -arraylist_t* rm_thrd_list; +#define RM_REQ_LOCK(rm) \ +do { \ + LOGDBG("locking RM[%d:%d] reqs", rm->app_id, rm->client_id); \ + ABT_mutex_lock(rm->reqs_sync); \ +} while (0) -/* One request manager thread is created for each client that a - * delegator serves. The main thread of the delegator assigns - * work to the request manager thread to retrieve data and send - * it back to the client. - * - * To start, given a read request from the client (via rpc) - * the handler function on the main delegator first queries the - * key/value store using the given file id and byte range to obtain - * the meta data on the physical location of the file data. This - * meta data provides the host delegator rank, the app/client - * ids that specify the log file on the remote delegator, the - * offset within the log file and the length of data. The rpc - * handler function sorts the meta data by host delegator rank, - * generates read requests, and inserts those into a list on a - * data structure shared with the request manager (del_req_set). +#define RM_REQ_UNLOCK(rm) \ +do { \ + LOGDBG("unlocking RM[%d:%d] reqs", rm->app_id, rm->client_id); \ + ABT_mutex_unlock(rm->reqs_sync); \ +} while (0) + +/* One request manager thread is created for each client of the + * server. The margo rpc handler thread(s) assign work to the + * request manager thread to handle data and metadata operations. 
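 *
 * Illustrative dispatch sketch (the call site shown is hypothetical, but
 * it uses only names defined in this file):
 *
 *   reqmgr_thrd_t* reqmgr = ...;              // per-client RM state
 *   RM_REQ_LOCK(reqmgr);
 *   arraylist_add(reqmgr->client_reqs, req);  // queue a client rpc request
 *   RM_REQ_UNLOCK(reqmgr);
 *   signal_new_requests(reqmgr);              // wake the RM thread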
* - * The request manager thread coordinates with the main thread + * The request manager thread coordinates with other threads * using a lock and a condition variable to protect the shared data - * structure and impose flow control. When assigned work, the - * request manager thread packs and sends request messages to - * service manager threads on remote delegators via MPI send. - * It waits for data to be sent back, and unpacks the read replies - * in each message into a shared memory buffer for the client. - * When the shared memory is full or all data has been received, - * it signals the client process to process the read replies. - * It iterates with the client until all incoming read replies - * have been transferred. */ -/* create a request manager thread for the given app_id - * and client_id, returns pointer to thread control structure - * on success and NULL on failure */ - -/* Create Request Manager thread for application client */ + * structure and impose flow control. When assigned work, the + * request manager thread either handles the request directly, or + * forwards requests to remote servers. + * + * For read requests, the request manager waits for data chunk + * responses and places the data into a shared memory data buffer + * specific to the client. When the shared memory is full or all + * data has been received, the request manager signals the client + * to process the read replies. It iterates with the client until + * all incoming read replies have been transferred. */ + +/* Create a request manager thread for the application client + * corresponding to the given app_id and client_id. + * Returns pointer to thread control structure on success, or + * NULL on failure */ reqmgr_thrd_t* unifyfs_rm_thrd_create(int app_id, int client_id) { /* allocate a new thread control structure */ @@ -108,45 +101,8 @@ reqmgr_thrd_t* unifyfs_rm_thrd_create(int app_id, int client_id) return NULL; } - /* allocate an array for listing read requests from client */ - thrd_ctrl->del_req_set = (msg_meta_t*)calloc(1, sizeof(msg_meta_t)); - if (thrd_ctrl->del_req_set == NULL) { - LOGERR("Failed to allocate read request structure for request " - "manager thread for app_id=%d client_id=%d", - app_id, client_id); - free(thrd_ctrl); - return NULL; - } - - /* allocate structure for tracking outstanding read requests - * this delegator has with service managers on other nodes */ - thrd_ctrl->del_req_stat = (del_req_stat_t*) - calloc(1, sizeof(del_req_stat_t)); - if (thrd_ctrl->del_req_stat == NULL) { - LOGERR("Failed to allocate delegator structure for request " - "manager thread for app_id=%d client_id=%d", - app_id, client_id); - free(thrd_ctrl->del_req_set); - free(thrd_ctrl); - return NULL; - } - - /* allocate a structure to track requests we have on each - * remote service manager */ - thrd_ctrl->del_req_stat->req_stat = (per_del_stat_t*) - calloc(glb_mpi_size, sizeof(per_del_stat_t)); - if (thrd_ctrl->del_req_stat->req_stat == NULL) { - LOGERR("Failed to allocate per-delegator structure for request " - "manager thread for app_id=%d client_id=%d", - app_id, client_id); - free(thrd_ctrl->del_req_stat); - free(thrd_ctrl->del_req_set); - free(thrd_ctrl); - return NULL; - } - - /* initialize lock for shared data structures between - * main thread and request delegator thread */ + /* initialize lock for shared data structures of the + * request manager */ pthread_mutexattr_t attr; pthread_mutexattr_init(&attr); pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); @@ -155,24 +111,30 @@ 
reqmgr_thrd_t* unifyfs_rm_thrd_create(int app_id, int client_id) LOGERR("pthread_mutex_init failed for request " "manager thread app_id=%d client_id=%d rc=%d (%s)", app_id, client_id, rc, strerror(rc)); - free(thrd_ctrl->del_req_stat->req_stat); - free(thrd_ctrl->del_req_stat); - free(thrd_ctrl->del_req_set); free(thrd_ctrl); return NULL; } - /* initailize condition variable to flow control - * work between main thread and request delegator thread */ + /* initialize condition variable to synchronize work + * notifications for the request manager thread */ rc = pthread_cond_init(&(thrd_ctrl->thrd_cond), NULL); if (rc != 0) { LOGERR("pthread_cond_init failed for request " "manager thread app_id=%d client_id=%d rc=%d (%s)", app_id, client_id, rc, strerror(rc)); pthread_mutex_destroy(&(thrd_ctrl->thrd_lock)); - free(thrd_ctrl->del_req_stat->req_stat); - free(thrd_ctrl->del_req_stat); - free(thrd_ctrl->del_req_set); + free(thrd_ctrl); + return NULL; + } + + /* create the argobots mutex for synchronizing access to reqs state */ + ABT_mutex_create(&(thrd_ctrl->reqs_sync)); + + /* allocate a list to track client rpc requests */ + thrd_ctrl->client_reqs = arraylist_create(); + if (thrd_ctrl->client_reqs == NULL) { + LOGERR("failed to allocate request manager client_reqs!"); + pthread_mutex_destroy(&(thrd_ctrl->thrd_lock)); free(thrd_ctrl); return NULL; } @@ -184,36 +146,18 @@ reqmgr_thrd_t* unifyfs_rm_thrd_create(int app_id, int client_id) /* initialize flow control flags */ thrd_ctrl->exit_flag = 0; thrd_ctrl->exited = 0; - thrd_ctrl->has_waiting_delegator = 0; + thrd_ctrl->waiting_for_work = 0; thrd_ctrl->has_waiting_dispatcher = 0; - /* insert our thread control structure into our list of - * active request manager threads, important to do this before - * launching thread since it uses list to lookup its structure */ - rc = arraylist_add(rm_thrd_list, thrd_ctrl); - if (rc != 0) { - pthread_cond_destroy(&(thrd_ctrl->thrd_cond)); - pthread_mutex_destroy(&(thrd_ctrl->thrd_lock)); - free(thrd_ctrl->del_req_stat->req_stat); - free(thrd_ctrl->del_req_stat); - free(thrd_ctrl->del_req_set); - free(thrd_ctrl); - return NULL; - } - thrd_ctrl->thrd_ndx = arraylist_size(rm_thrd_list) - 1; - /* launch request manager thread */ rc = pthread_create(&(thrd_ctrl->thrd), NULL, - rm_delegate_request_thread, (void*)thrd_ctrl); + request_manager_thread, (void*)thrd_ctrl); if (rc != 0) { LOGERR("failed to create request manager thread for " "app_id=%d client_id=%d - rc=%d (%s)", app_id, client_id, rc, strerror(rc)); pthread_cond_destroy(&(thrd_ctrl->thrd_cond)); pthread_mutex_destroy(&(thrd_ctrl->thrd_lock)); - free(thrd_ctrl->del_req_stat->req_stat); - free(thrd_ctrl->del_req_stat); - free(thrd_ctrl->del_req_set); free(thrd_ctrl); return NULL; } @@ -221,190 +165,41 @@ reqmgr_thrd_t* unifyfs_rm_thrd_create(int app_id, int client_id) return thrd_ctrl; } -/* Lookup RM thread control structure */ -reqmgr_thrd_t* rm_get_thread(int thrd_id) -{ - return (reqmgr_thrd_t*) arraylist_get(rm_thrd_list, thrd_id); -} - -static void print_send_msgs(send_msg_t* send_metas, - int msg_cnt) -{ - int i; - send_msg_t* msg; - for (i = 0; i < msg_cnt; i++) { - msg = send_metas + i; - LOGDBG("msg[%d] gfid:%d length:%zu file_offset:%zu " - "dest_offset:%zu dest_app:%d dest_clid:%d", - i, msg->src_fid, msg->length, msg->src_offset, - msg->dest_offset, msg->dest_app_id, msg->dest_client_id); - } -} - -static void print_remote_del_reqs(int app_id, int cli_id, - del_req_stat_t* del_req_stat) -{ - int i; - for (i = 0; i < del_req_stat->del_cnt; 
i++) { - LOGDBG("remote_delegator:%d, req_cnt:%d", - del_req_stat->req_stat[i].del_id, - del_req_stat->req_stat[i].req_cnt); - } -} - -#if 0 // NOT CURRENTLY USED -static void print_recv_msg(int app_id, int cli_id, - int thrd_id, - shm_meta_t* msg) -{ - LOGDBG("recv_msg: app_id:%d, cli_id:%d, thrd_id:%d, " - "fid:%d, offset:%ld, len:%ld", - app_id, cli_id, thrd_id, msg->src_fid, - msg->offset, msg->length); -} -#endif - - -/* order keyvals by gfid, then host delegator rank */ -static int compare_kv_gfid_rank(const void* a, const void* b) -{ - const unifyfs_keyval_t* kv_a = a; - const unifyfs_keyval_t* kv_b = b; - - int gfid_a = kv_a->key.fid; - int gfid_b = kv_b->key.fid; - if (gfid_a == gfid_b) { - int rank_a = kv_a->val.delegator_rank; - int rank_b = kv_b->val.delegator_rank; - if (rank_a == rank_b) { - return 0; - } else if (rank_a < rank_b) { - return -1; - } else { - return 1; - } - } else if (gfid_a < gfid_b) { - return -1; - } else { - return 1; - } -} - -/* order read requests by destination delegator rank */ -static int compare_msg_delegators(const void* a, const void* b) -{ - const send_msg_t* msg_a = a; - const send_msg_t* msg_b = b; - int rank_a = msg_a->dest_delegator_rank; - int rank_b = msg_b->dest_delegator_rank; - - if (rank_a == rank_b) { - return 0; - } else if (rank_a < rank_b) { - return -1; - } else { - return 1; - } -} - -unifyfs_key_t** alloc_key_array(int elems) -{ - int size = elems * (sizeof(unifyfs_key_t*) + sizeof(unifyfs_key_t)); - - void* mem_block = calloc(size, sizeof(char)); - - unifyfs_key_t** array_ptr = mem_block; - unifyfs_key_t* key_ptr = (unifyfs_key_t*)(array_ptr + elems); - - for (int i = 0; i < elems; i++) { - array_ptr[i] = &key_ptr[i]; - } - - return (unifyfs_key_t**)mem_block; -} - -fattr_key_t** alloc_attr_key_array(int elems) -{ - int size = elems * (sizeof(fattr_key_t*) + sizeof(fattr_key_t)); - - void* mem_block = calloc(size, sizeof(char)); - - fattr_key_t** array_ptr = mem_block; - fattr_key_t* key_ptr = (fattr_key_t*)(array_ptr + elems); - - for (int i = 0; i < elems; i++) { - array_ptr[i] = &key_ptr[i]; - } - - return (fattr_key_t**)mem_block; -} - -unifyfs_val_t** alloc_value_array(int elems) -{ - int size = elems * (sizeof(unifyfs_val_t*) + sizeof(unifyfs_val_t)); - - void* mem_block = calloc(size, sizeof(char)); - - unifyfs_val_t** array_ptr = mem_block; - unifyfs_val_t* key_ptr = (unifyfs_val_t*)(array_ptr + elems); - - for (int i = 0; i < elems; i++) { - array_ptr[i] = &key_ptr[i]; - } - - return (unifyfs_val_t**)mem_block; -} - -void free_key_array(unifyfs_key_t** array) -{ - free(array); -} - -void free_value_array(unifyfs_val_t** array) -{ - free(array); -} - -void free_attr_key_array(fattr_key_t** array) -{ - free(array); -} - static void debug_print_read_req(server_read_req_t* req) { if (NULL != req) { - LOGDBG("server_read_req[%d] status=%d, gfid=%d, num_remote=%d", - req->req_ndx, req->status, req->extent.gfid, - req->num_remote_reads); + LOGDBG("server_read_req[%d] status=%d, num_remote=%d", + req->req_ndx, req->status, req->num_server_reads); } } -static server_read_req_t* reserve_read_req(reqmgr_thrd_t* thrd_ctrl) +server_read_req_t* rm_reserve_read_req(reqmgr_thrd_t* thrd_ctrl) { server_read_req_t* rdreq = NULL; - RM_LOCK(thrd_ctrl); + RM_REQ_LOCK(thrd_ctrl); if (thrd_ctrl->num_read_reqs < RM_MAX_ACTIVE_REQUESTS) { if (thrd_ctrl->next_rdreq_ndx < (RM_MAX_ACTIVE_REQUESTS - 1)) { rdreq = thrd_ctrl->read_reqs + thrd_ctrl->next_rdreq_ndx; - assert((rdreq->req_ndx == 0) && (rdreq->extent.gfid == 0)); + assert((rdreq->req_ndx 
== 0) && (rdreq->in_use == 0)); rdreq->req_ndx = thrd_ctrl->next_rdreq_ndx++; } else { // search for unused slot for (int i = 0; i < RM_MAX_ACTIVE_REQUESTS; i++) { rdreq = thrd_ctrl->read_reqs + i; - if ((rdreq->req_ndx == 0) && (rdreq->extent.gfid == 0)) { + if ((rdreq->req_ndx == 0) && (rdreq->in_use == 0)) { rdreq->req_ndx = i; break; } } } thrd_ctrl->num_read_reqs++; + rdreq->in_use = 1; LOGDBG("reserved read req %d (active=%d, next=%d)", rdreq->req_ndx, thrd_ctrl->num_read_reqs, thrd_ctrl->next_rdreq_ndx); debug_print_read_req(rdreq); } else { LOGERR("maxed-out request manager read_reqs array!!"); } - RM_UNLOCK(thrd_ctrl); + RM_REQ_UNLOCK(thrd_ctrl); return rdreq; } @@ -412,8 +207,9 @@ static int release_read_req(reqmgr_thrd_t* thrd_ctrl, server_read_req_t* rdreq) { int rc = (int)UNIFYFS_SUCCESS; - RM_LOCK(thrd_ctrl); + if (rdreq != NULL) { + RM_REQ_LOCK(thrd_ctrl); LOGDBG("releasing read req %d", rdreq->req_ndx); if (rdreq->req_ndx == (thrd_ctrl->next_rdreq_ndx - 1)) { thrd_ctrl->next_rdreq_ndx--; @@ -428,561 +224,296 @@ static int release_read_req(reqmgr_thrd_t* thrd_ctrl, thrd_ctrl->num_read_reqs--; LOGDBG("after release (active=%d, next=%d)", thrd_ctrl->num_read_reqs, thrd_ctrl->next_rdreq_ndx); - debug_print_read_req(rdreq); + RM_REQ_UNLOCK(thrd_ctrl); } else { - rc = UNIFYFS_ERROR_INVAL; + rc = EINVAL; LOGERR("NULL read_req"); } - RM_UNLOCK(thrd_ctrl); + return rc; } -static void signal_new_requests(reqmgr_thrd_t* thrd_ctrl) +int rm_release_read_req(reqmgr_thrd_t* thrd_ctrl, + server_read_req_t* rdreq) { - // NOTE: this fn assumes thrd_ctrl->thrd_lock is locked - - /* wake up the request manager thread for the requesting client */ - if (!thrd_ctrl->has_waiting_delegator) { - /* delegator thread is not waiting, but we are in critical - * section, we just added requests so we must wait for delegator - * to signal us that it's reached the critical section before - * we escape so we don't overwrite these requests before it - * has had a chance to process them */ - thrd_ctrl->has_waiting_dispatcher = 1; - pthread_cond_wait(&thrd_ctrl->thrd_cond, &thrd_ctrl->thrd_lock); - - /* delegator thread has signaled us that it's now waiting */ - thrd_ctrl->has_waiting_dispatcher = 0; - } - /* have a delegator thread waiting on condition variable, - * signal it to begin processing the requests we just added */ - pthread_cond_signal(&thrd_ctrl->thrd_cond); + return release_read_req(thrd_ctrl, rdreq); } -static void signal_new_responses(reqmgr_thrd_t* thrd_ctrl) +static void signal_new_requests(reqmgr_thrd_t* reqmgr) { - // NOTE: this fn assumes thrd_ctrl->thrd_lock is locked - - /* wake up the request manager thread */ - if (thrd_ctrl->has_waiting_delegator) { - /* have a delegator thread waiting on condition variable, - * signal it to begin processing the responses we just added */ - pthread_cond_signal(&thrd_ctrl->thrd_cond); + RM_LOCK(reqmgr); + pid_t this_thread = unifyfs_gettid(); + if (this_thread != reqmgr->tid) { + /* wake up the request manager thread for the requesting client */ + if (!reqmgr->waiting_for_work) { + /* reqmgr thread is not waiting, but we are in critical + * section, we just added requests so we must wait for reqmgr + * to signal us that it's reached the critical section before + * we escape so we don't overwrite these requests before it + * has had a chance to process them */ + reqmgr->has_waiting_dispatcher = 1; + pthread_cond_wait(&reqmgr->thrd_cond, &reqmgr->thrd_lock); + + /* reqmgr thread has signaled us that it's now waiting */ + reqmgr->has_waiting_dispatcher = 
0; + } + /* have a reqmgr thread waiting on condition variable, + * signal it to begin processing the requests we just added */ + pthread_cond_signal(&reqmgr->thrd_cond); } + RM_UNLOCK(reqmgr); } -/* issue remote chunk read requests for extent chunks - * contained within keyvals - */ -int create_request_messages(reqmgr_thrd_t* thrd_ctrl, - int client_rank, - int num_vals, - unifyfs_keyval_t* keyvals) +static void signal_new_responses(reqmgr_thrd_t* reqmgr) { - int thrd_id = thrd_ctrl->thrd_ndx; - int app_id = thrd_ctrl->app_id; - int client_id = thrd_ctrl->client_id; - - /* wait for lock for shared data structures holding requests - * and condition variable */ - RM_LOCK(thrd_ctrl); - - // set up the thread_control delegator request set - // TODO: make this a function - int i; - for (i = 0; i < num_vals; i++) { - send_msg_t* meta = &(thrd_ctrl->del_req_set->msg_meta[i]); - memset(meta, 0, sizeof(send_msg_t)); - - debug_log_key_val(__func__, &keyvals[i].key, &keyvals[i].val); - - /* physical offset of the requested file segment on the log file */ - meta->dest_offset = keyvals[i].val.addr; - - /* rank of the remote delegator */ - meta->dest_delegator_rank = keyvals[i].val.delegator_rank; - - /* dest_client_id and dest_app_id uniquely identify the remote - * physical log file that contains the requested segments */ - meta->dest_app_id = keyvals[i].val.app_id; - meta->dest_client_id = keyvals[i].val.rank; - meta->length = (size_t)keyvals[i].val.len; - - /* src_app_id and src_cli_id identifies the requested client */ - meta->src_app_id = app_id; - meta->src_cli_id = client_id; - - /* src_offset is the logical offset of the shared file */ - meta->src_offset = keyvals[i].key.offset; - meta->src_delegator_rank = glb_mpi_rank; - meta->src_fid = keyvals[i].key.fid; - meta->src_dbg_rank = client_rank; - meta->src_thrd = thrd_id; - } - - thrd_ctrl->del_req_set->num = num_vals; - - if (num_vals > 1) { - /* sort read requests to be sent to the same delegators. 
*/ - qsort(thrd_ctrl->del_req_set->msg_meta, - thrd_ctrl->del_req_set->num, - sizeof(send_msg_t), compare_msg_delegators); - } - - /* debug print */ - print_send_msgs(thrd_ctrl->del_req_set->msg_meta, - thrd_ctrl->del_req_set->num); - - /* get pointer to list of delegator stat objects to record - * delegator rank and count of requests for each delegator */ - per_del_stat_t* req_stat = thrd_ctrl->del_req_stat->req_stat; - - /* get pointer to send message structures, one for each request */ - send_msg_t* msg_meta = thrd_ctrl->del_req_set->msg_meta; - - /* iterate over read requests and count number of requests - * to be sent to each delegator */ - int del_ndx = 0; - req_stat[del_ndx].del_id = msg_meta[0].dest_delegator_rank; - req_stat[del_ndx].req_cnt = 1; - - for (i = 1; i < thrd_ctrl->del_req_set->num; i++) { - int cur_rank = msg_meta[i].dest_delegator_rank; - int prev_rank = msg_meta[i-1].dest_delegator_rank; - if (cur_rank == prev_rank) { - /* another message for the current delegator */ - req_stat[del_ndx].req_cnt++; - } else { - /* new delegator */ - del_ndx++; - req_stat[del_ndx].del_id = msg_meta[i].dest_delegator_rank; - req_stat[del_ndx].req_cnt = 1; + RM_LOCK(reqmgr); + pid_t this_thread = unifyfs_gettid(); + if (this_thread != reqmgr->tid) { + /* wake up the request manager thread */ + if (reqmgr->waiting_for_work) { + /* have a reqmgr thread waiting on condition variable, + * signal it to begin processing the responses we just added */ + pthread_cond_signal(&reqmgr->thrd_cond); } } - - /* record total number of delegators we'll send requests to */ - thrd_ctrl->del_req_stat->del_cnt = del_ndx + 1; - - /* debug print */ - print_remote_del_reqs(app_id, thrd_id, thrd_ctrl->del_req_stat); - - /* wake up the request manager thread for the requesting client */ - signal_new_requests(thrd_ctrl); - - /* done updating shared variables, release the lock */ - RM_UNLOCK(thrd_ctrl); - - return UNIFYFS_SUCCESS; + RM_UNLOCK(reqmgr); } /* issue remote chunk read requests for extent chunks * listed within keyvals */ -int create_chunk_requests(reqmgr_thrd_t* thrd_ctrl, - server_read_req_t* rdreq, - int num_vals, - unifyfs_keyval_t* keyvals) +int rm_create_chunk_requests(reqmgr_thrd_t* thrd_ctrl, + server_read_req_t* rdreq, + int num_vals, + unifyfs_keyval_t* keyvals) { - int thrd_id = thrd_ctrl->thrd_ndx; - int app_id = thrd_ctrl->app_id; - int client_id = thrd_ctrl->client_id; + LOGDBG("creating chunk requests for rdreq %d", rdreq->req_ndx); + /* allocate read request structures */ chunk_read_req_t* all_chunk_reads = (chunk_read_req_t*) calloc((size_t)num_vals, sizeof(chunk_read_req_t)); if (NULL == all_chunk_reads) { LOGERR("failed to allocate chunk-reads array"); - return UNIFYFS_ERROR_NOMEM; + return ENOMEM; } - - RM_LOCK(thrd_ctrl); - - LOGDBG("creating chunk requests for rdreq %d", rdreq->req_ndx); - rdreq->chunks = all_chunk_reads; - int i, curr_del; + /* iterate over write index values and create read requests + * for each one, also count up number of servers that we'll + * forward read requests to */ + int i; int prev_del = -1; - int del_ndx = 0; - chunk_read_req_t* chk_read; + int num_del = 0; for (i = 0; i < num_vals; i++) { - /* count the delegators */ - curr_del = keyvals[i].val.delegator_rank; - if ((prev_del != -1) && (curr_del != prev_del)) { - del_ndx++; + /* get target server for this request */ + int curr_del = keyvals[i].val.delegator_rank; + + /* if target server is different from last target, + * increment our server count */ + if ((prev_del == -1) || (curr_del != prev_del)) 
{ + num_del++; } prev_del = curr_del; - /* create chunk-reads */ + /* get pointer to next read request structure */ debug_log_key_val(__func__, &keyvals[i].key, &keyvals[i].val); - chk_read = all_chunk_reads + i; - chk_read->nbytes = keyvals[i].val.len; - chk_read->offset = keyvals[i].key.offset; - chk_read->log_offset = keyvals[i].val.addr; - chk_read->log_app_id = keyvals[i].val.app_id; - chk_read->log_client_id = keyvals[i].val.rank; + chunk_read_req_t* chk = all_chunk_reads + i; + + /* fill in chunk read request */ + chk->gfid = keyvals[i].key.gfid; + chk->nbytes = keyvals[i].val.len; + chk->offset = keyvals[i].key.offset; + chk->log_offset = keyvals[i].val.addr; + chk->log_app_id = keyvals[i].val.app_id; + chk->log_client_id = keyvals[i].val.rank; } /* allocate per-delgator chunk-reads */ - int num_dels = del_ndx + 1; - rdreq->num_remote_reads = num_dels; - rdreq->remote_reads = (remote_chunk_reads_t*) - calloc((size_t)num_dels, sizeof(remote_chunk_reads_t)); + int num_dels = num_del; + rdreq->num_server_reads = num_dels; + rdreq->remote_reads = (server_chunk_reads_t*) + calloc((size_t)num_dels, sizeof(server_chunk_reads_t)); if (NULL == rdreq->remote_reads) { LOGERR("failed to allocate remote-reads array"); - RM_UNLOCK(thrd_ctrl); - return UNIFYFS_ERROR_NOMEM; + return ENOMEM; } - /* populate per-delegator chunk-reads info */ - size_t del_data_sz = 0; - remote_chunk_reads_t* del_reads; + /* get pointer to start of chunk read request array */ + server_chunk_reads_t* reads = rdreq->remote_reads; + + /* iterate over write index values again and now create + * per-server chunk-reads info, for each server + * that we'll request data from, this totals up the number + * of read requests and total read data size from that + * server */ prev_del = -1; - del_ndx = 0; + size_t del_data_sz = 0; for (i = 0; i < num_vals; i++) { - curr_del = keyvals[i].val.delegator_rank; + /* get target server for this request */ + int curr_del = keyvals[i].val.delegator_rank; + + /* if target server is different from last target, + * close out the total number of bytes for the last + * server, note this assumes our write index values are + * sorted by server rank */ if ((prev_del != -1) && (curr_del != prev_del)) { - /* record total data for previous delegator */ - del_reads = rdreq->remote_reads + del_ndx; - del_reads->total_sz = del_data_sz; - /* advance to next delegator */ - del_ndx++; + /* record total data for previous server */ + reads->total_sz = del_data_sz; + + /* advance to read request for next server */ + reads += 1; + + /* reset our running tally of bytes to 0 */ del_data_sz = 0; } prev_del = curr_del; - /* update total data size for current delegator */ + /* update total read data size for current server */ del_data_sz += keyvals[i].val.len; - del_reads = rdreq->remote_reads + del_ndx; - if (0 == del_reads->num_chunks) { - /* initialize structure */ - del_reads->rank = curr_del; - del_reads->rdreq_id = rdreq->req_ndx; - del_reads->reqs = all_chunk_reads + i; - del_reads->resp = NULL; + /* if this is the first read request for this server, + * initialize fields on the per-server read request + * structure */ + if (0 == reads->num_chunks) { + /* TODO: let's describe what these fields are for */ + reads->rank = curr_del; + reads->rdreq_id = rdreq->req_ndx; + reads->reqs = all_chunk_reads + i; + reads->resp = NULL; } - del_reads->num_chunks++; + + /* increment number of read requests we're sending + * to this server */ + reads->num_chunks++; + } + + /* record total data size for final server (if any), 
+ * would have missed doing this in the above loop */ + if (num_vals > 0) { + reads->total_sz = del_data_sz; } - del_reads = rdreq->remote_reads + del_ndx; - del_reads->total_sz = del_data_sz; + + /* mark request as ready to be started */ + rdreq->status = READREQ_READY; /* wake up the request manager thread for the requesting client */ signal_new_requests(thrd_ctrl); - RM_UNLOCK(thrd_ctrl); return UNIFYFS_SUCCESS; } -/************************ - * These functions are called by the rpc handler to assign work - * to the request manager thread - ***********************/ - -/* given an app_id, client_id and global file id, - * compute and return file size for specified file - */ -int rm_cmd_filesize( - int app_id, /* app_id for requesting client */ - int client_id, /* client_id for requesting client */ - int gfid, /* global file id of read request */ - size_t* outsize) /* output file size */ +int rm_submit_read_request(server_read_req_t* req) { - /* set offset and length to request *all* key/value pairs - * for this file */ - size_t offset = 0; + int ret = UNIFYFS_SUCCESS; + int i = 0; + app_client* client = NULL; + reqmgr_thrd_t* thrd_ctrl = NULL; + server_read_req_t* rdreq = NULL; + + if (!req || !req->chunks || !req->remote_reads) { + return EINVAL; + } - /* want to pick the highest integer offset value a file - * could have here */ - // TODO: would like to unsed max for unsigned long, but - // that fails to return any keys for some reason - size_t length = (SIZE_MAX >> 1) - 1; - - /* get the locations of all the read requests from the - * key-value store*/ - unifyfs_key_t key1, key2; - - /* create key to describe first byte we'll read */ - key1.fid = gfid; - key1.offset = offset; - - /* create key to describe last byte we'll read */ - key2.fid = gfid; - key2.offset = offset + length - 1; - - unifyfs_keyval_t* keyvals; - unifyfs_key_t* unifyfs_keys[2] = {&key1, &key2}; - int key_lens[2] = {sizeof(unifyfs_key_t), sizeof(unifyfs_key_t)}; - - /* look up all entries in this range */ - int num_vals = 0; - keyvals = (unifyfs_keyval_t*) calloc(UNIFYFS_MAX_SPLIT_CNT, - sizeof(unifyfs_keyval_t)); - if (NULL == keyvals) { - LOGERR("failed to allocate keyvals"); - return UNIFYFS_ERROR_NOMEM; - } - - int rc = unifyfs_get_file_extents(2, unifyfs_keys, key_lens, - &num_vals, &keyvals); - /* TODO: if there are file extents not accounted for we should - * either return 0 for that date (holes) or EOF if reading past - * the end of the file */ - if (UNIFYFS_SUCCESS != rc) { - // we need to let the client know that there was an error - free(keyvals); + client = get_app_client(req->app_id, req->client_id); + if (NULL == client) { return UNIFYFS_FAILURE; } - /* compute our file size by iterating over each file - * segment and taking the max logical offset */ - int i; - size_t filesize = 0; - for (i = 0; i < num_vals; i++) { - /* get pointer to next key value pair */ - unifyfs_keyval_t* kv = &keyvals[i]; + thrd_ctrl = client->reqmgr; + + rdreq = rm_reserve_read_req(thrd_ctrl); + if (!rdreq) { + LOGERR("failed to allocate a request"); + return UNIFYFS_FAILURE; + } - /* get last byte offset for this segment of the file */ - size_t last_offset = kv->key.offset + kv->val.len; + rdreq->app_id = req->app_id; + rdreq->client_id = req->client_id; + rdreq->num_server_reads = req->num_server_reads; + rdreq->chunks = req->chunks; + rdreq->remote_reads = req->remote_reads; - /* update our filesize if this offset is bigger than the current max */ - if (last_offset > filesize) { - filesize = last_offset; - } + for (i = 0; i < 
rdreq->num_server_reads; i++) { + server_chunk_reads_t* read = &rdreq->remote_reads[i]; + read->rdreq_id = rdreq->req_ndx; } - // cleanup - free(keyvals); + rdreq->status = READREQ_READY; + signal_new_requests(thrd_ctrl); - *outsize = filesize; - return rc; + return ret; } -int create_gfid_chunk_reads(reqmgr_thrd_t* thrd_ctrl, - int gfid, int app_id, int client_id, - int num_keys, unifyfs_key_t** keys, int* keylens) +/* signal the client process for it to start processing read + * data in shared memory */ +static int client_signal(shm_data_header* hdr, + shm_data_state_e flag) { - // TODO: might want to get this from a memory pool - unifyfs_keyval_t* keyvals = calloc(UNIFYFS_MAX_SPLIT_CNT, - sizeof(unifyfs_keyval_t)); - if (NULL == keyvals) { - LOGERR("failed to allocate keyvals"); - return UNIFYFS_ERROR_NOMEM; - } - - int num_vals = 0; - int rc = unifyfs_get_file_extents(num_keys, keys, keylens, - &num_vals, &keyvals); - /* TODO: if there are file extents not accounted for we should - * either return 0 for that data (holes) or EOF if reading past - * the end of the file */ - if (UNIFYFS_SUCCESS != rc || num_vals == 0) { - /* failed to find any key / value pairs */ - rc = UNIFYFS_FAILURE; - } else { - if (num_vals > 1) { - /* sort keyvals by delegator */ - qsort(keyvals, (size_t)num_vals, sizeof(unifyfs_keyval_t), - compare_kv_gfid_rank); - } - server_read_req_t* rdreq = reserve_read_req(thrd_ctrl); - if (NULL == rdreq) { - rc = UNIFYFS_FAILURE; - } else { - rdreq->app_id = app_id; - rdreq->client_id = client_id; - rdreq->extent.gfid = gfid; - rdreq->extent.errcode = EINPROGRESS; - rc = create_chunk_requests(thrd_ctrl, rdreq, - num_vals, keyvals); - if (rc != (int)UNIFYFS_SUCCESS) { - release_read_req(thrd_ctrl, rdreq); - } - } + if (flag == SHMEM_REGION_DATA_READY) { + LOGDBG("setting data-ready"); + } else if (flag == SHMEM_REGION_DATA_COMPLETE) { + LOGDBG("setting data-complete"); } - // cleanup - free(keyvals); + /* we signal the client by setting a flag value within + * a shared memory segment that the client is monitoring */ + hdr->state = flag; - return rc; -} + /* TODO: MEM_FLUSH */ -/* read function for one requested extent, - * called from rpc handler to fill shared data structures - * with read requests to be handled by the delegator thread - * returns before requests are handled - */ -int rm_cmd_read( - int app_id, /* app_id for requesting client */ - int client_id, /* client_id for requesting client */ - int gfid, /* global file id of read request */ - size_t offset, /* logical file offset of read request */ - size_t length) /* number of bytes to read */ -{ - /* get pointer to app structure for this app id */ - app_config_t* app_config = - (app_config_t*)arraylist_get(app_config_list, app_id); - - /* get thread id for this client */ - int thrd_id = app_config->thrd_idxs[client_id]; - - /* look up thread control structure */ - reqmgr_thrd_t* thrd_ctrl = rm_get_thread(thrd_id); - - /* get chunks corresponding to requested client read extent - * - * Generate a pair of keys for the read request, representing the start - * and end offset. MDHIM returns all key-value pairs that fall within - * the offset range. - * - * TODO: this is specific to the MDHIM in the source tree and not portable - * to other KV-stores. This needs to be revisited to utilize some - * other mechanism to retrieve all relevant key-value pairs from the - * KV-store. 
- */ - unifyfs_key_t key1, key2; - - /* create key to describe first byte we'll read */ - key1.fid = gfid; - key1.offset = offset; - - /* create key to describe last byte we'll read */ - key2.fid = gfid; - key2.offset = offset + length - 1; - - unifyfs_key_t* unifyfs_keys[2] = {&key1, &key2}; - int key_lens[2] = {sizeof(unifyfs_key_t), sizeof(unifyfs_key_t)}; - - return create_gfid_chunk_reads(thrd_ctrl, gfid, app_id, client_id, - 2, unifyfs_keys, key_lens); + return UNIFYFS_SUCCESS; } -/* send the read requests to the remote delegators - * - * @param app_id: application id - * @param client_id: client id for requesting process - * @param gfid: global file id - * @param req_num: number of read requests - * @param reqbuf: read requests buffer - * @return success/error code */ -int rm_cmd_mread(int app_id, int client_id, - size_t req_num, void* reqbuf) +/* wait until client has processed all read data in shared memory */ +static int client_wait(shm_data_header* hdr) { - /* get pointer to app structure for this app id */ - app_config_t* app_config = - (app_config_t*)arraylist_get(app_config_list, app_id); - - /* get thread id for this client */ - int thrd_id = app_config->thrd_idxs[client_id]; - - /* look up thread control structure */ - reqmgr_thrd_t* thrd_ctrl = rm_get_thread(thrd_id); - - /* get debug rank for this client */ - int cli_rank = app_config->dbg_ranks[client_id]; - - /* get the locations of all the read requests from the key-value store */ - unifyfs_ReadRequest_table_t readRequest = - unifyfs_ReadRequest_as_root(reqbuf); - unifyfs_Extent_vec_t extents = unifyfs_ReadRequest_extents(readRequest); - size_t extents_len = unifyfs_Extent_vec_len(extents); - assert(extents_len == req_num); - - // allocate key storage - unifyfs_key_t** unifyfs_keys; - int* key_lens; - size_t key_cnt = req_num * 2; - unifyfs_keys = alloc_key_array(key_cnt); - key_lens = (int*) calloc(key_cnt, sizeof(int)); - if ((NULL == unifyfs_keys) || - (NULL == key_lens)) { - // this is a fatal error - // TODO: we need better error handling - LOGERR("Error allocating buffers"); - return (int)UNIFYFS_ERROR_NOMEM; - } - - /* get chunks corresponding to requested client read extents */ - int rc, num_keys; - int fid = -1; - int last_fid = -1; - int ndx = 0; - size_t j, eoff, elen; - for (j = 0; j < req_num; j++) { - fid = unifyfs_Extent_fid(unifyfs_Extent_vec_at(extents, j)); - if (j && (fid != last_fid)) { - // create requests for all extents of last_fid - num_keys = ndx; - rc = create_gfid_chunk_reads(thrd_ctrl, last_fid, app_id, - client_id, num_keys, - unifyfs_keys, key_lens); - if (rc != UNIFYFS_SUCCESS) { - LOGERR("Error creating chunk reads for gfid=%d", last_fid); - } - // reset ndx for current fid - ndx = 0; - } - - eoff = unifyfs_Extent_offset(unifyfs_Extent_vec_at(extents, j)); - elen = unifyfs_Extent_length(unifyfs_Extent_vec_at(extents, j)); - LOGDBG("gfid:%d, offset:%zu, length:%zu", fid, eoff, elen); - - /* Generate a pair of keys for each read request, representing - * the start and end offsets. MDHIM returns all key-value pairs that - * fall within the offset range. - * - * TODO: this is specific to the MDHIM in the source tree and not - * portable to other KV-stores. This needs to be revisited to - * utilize some other mechanism to retrieve all relevant KV - * pairs from the KV-store. 
- */ - key_lens[ndx] = sizeof(unifyfs_key_t); - key_lens[ndx + 1] = sizeof(unifyfs_key_t); + int rc = (int)UNIFYFS_SUCCESS; - /* create key to describe first byte we'll read */ - unifyfs_keys[ndx]->fid = fid; - unifyfs_keys[ndx]->offset = eoff; + /* specify time to sleep between checking flag in shared + * memory indicating client has processed data */ + struct timespec shm_wait_tm; + shm_wait_tm.tv_sec = 0; + shm_wait_tm.tv_nsec = SHM_WAIT_INTERVAL; - /* create key to describe last byte we'll read */ - unifyfs_keys[ndx + 1]->fid = fid; - unifyfs_keys[ndx + 1]->offset = eoff + elen - 1; + /* wait for client to set flag to 0 */ + int max_sleep = 10000000; // 10s + volatile int* vip = (volatile int*)&(hdr->state); + while (*vip != SHMEM_REGION_EMPTY) { + /* not there yet, sleep for a while */ + nanosleep(&shm_wait_tm, NULL); - ndx += 2; - last_fid = fid; - } + /* TODO: MEM_FETCH */ - // create requests for all extents of last_fid - num_keys = ndx; - rc = create_gfid_chunk_reads(thrd_ctrl, last_fid, app_id, - client_id, num_keys, - unifyfs_keys, key_lens); - if (rc != UNIFYFS_SUCCESS) { - LOGERR("Error creating chunk reads for gfid=%d", last_fid); + max_sleep--; + if (0 == max_sleep) { + LOGERR("timed out waiting for empty"); + rc = (int)UNIFYFS_ERROR_SHMEM; + break; + } } - // cleanup - free_key_array(unifyfs_keys); - free(key_lens); - + /* reset header to reflect empty state */ + hdr->meta_cnt = 0; + hdr->bytes = 0; return rc; } /* function called by main thread to instruct * resource manager thread to exit, * returns UNIFYFS_SUCCESS on success */ -int rm_cmd_exit(reqmgr_thrd_t* thrd_ctrl) +int rm_request_exit(reqmgr_thrd_t* thrd_ctrl) { - /* grab the lock */ - RM_LOCK(thrd_ctrl); - if (thrd_ctrl->exited) { /* already done */ - RM_UNLOCK(thrd_ctrl); return UNIFYFS_SUCCESS; } - /* if delegator thread is not waiting in critical + /* grab the lock */ + RM_LOCK(thrd_ctrl); + + /* if reqmgr thread is not waiting in critical * section, let's wait on it to come back */ - if (!thrd_ctrl->has_waiting_delegator) { - /* delegator thread is not in critical section, + if (!thrd_ctrl->waiting_for_work) { + /* reqmgr thread is not in critical section, * tell it we've got something and signal it */ thrd_ctrl->has_waiting_dispatcher = 1; pthread_cond_wait(&thrd_ctrl->thrd_cond, &thrd_ctrl->thrd_lock); @@ -991,270 +522,49 @@ int rm_cmd_exit(reqmgr_thrd_t* thrd_ctrl) thrd_ctrl->has_waiting_dispatcher = 0; } - /* inform delegator thread that it's time to exit */ + /* inform reqmgr thread that it's time to exit */ thrd_ctrl->exit_flag = 1; - /* signal delegator thread */ + /* signal reqmgr thread */ pthread_cond_signal(&thrd_ctrl->thrd_cond); /* release the lock */ RM_UNLOCK(thrd_ctrl); - /* wait for delegator thread to exit */ - void* status; - pthread_join(thrd_ctrl->thrd, &status); - thrd_ctrl->exited = 1; - - /* free storage holding shared data structures */ - free(thrd_ctrl->del_req_set); - free(thrd_ctrl->del_req_stat->req_stat); - free(thrd_ctrl->del_req_stat); - - return UNIFYFS_SUCCESS; -} - -/* - * synchronize all the indices and file attributes - * to the key-value store - * - * @param app_id: the application id - * @param client_side_id: client rank in app - * @param gfid: global file id - * @return success/error code - */ -int rm_cmd_fsync(int app_id, int client_side_id, int gfid) -{ - int ret = 0; - - unifyfs_key_t** unifyfs_keys; - unifyfs_val_t** unifyfs_vals; - int* unifyfs_key_lens = NULL; - int* unifyfs_val_lens = NULL; - - fattr_key_t** fattr_keys = NULL; - unifyfs_file_attr_t** fattr_vals = 
NULL; - int* fattr_key_lens = NULL; - int* fattr_val_lens = NULL; - size_t i, attr_num_entries, extent_num_entries; - - app_config_t* app_config = (app_config_t*) - arraylist_get(app_config_list, app_id); - - extent_num_entries = *(size_t*) - (app_config->shm_superblocks[client_side_id] - + app_config->meta_offset); - - /* - * indices are stored in the superblock shared memory - * created by the client - */ - int page_sz = getpagesize(); - unifyfs_index_t* meta_payload = (unifyfs_index_t*) - (app_config->shm_superblocks[client_side_id] - + app_config->meta_offset + page_sz); - - // allocate storage for values - // TODO: possibly get this from memory pool - unifyfs_keys = alloc_key_array(extent_num_entries); - unifyfs_vals = alloc_value_array(extent_num_entries); - unifyfs_key_lens = calloc(extent_num_entries, sizeof(int)); - unifyfs_val_lens = calloc(extent_num_entries, sizeof(int)); - if ((NULL == unifyfs_keys) || - (NULL == unifyfs_vals) || - (NULL == unifyfs_key_lens) || - (NULL == unifyfs_val_lens)) { - return (int)UNIFYFS_ERROR_NOMEM; - } - - // file extents - for (i = 0; i < extent_num_entries; i++) { - unifyfs_keys[i]->fid = meta_payload[i].fid; - unifyfs_keys[i]->offset = meta_payload[i].file_pos; - - unifyfs_vals[i]->addr = meta_payload[i].mem_pos; - unifyfs_vals[i]->len = meta_payload[i].length; - unifyfs_vals[i]->delegator_rank = glb_mpi_rank; - unifyfs_vals[i]->app_id = app_id; - unifyfs_vals[i]->rank = client_side_id; - - LOGDBG("extent - fid:%d, offset:%zu, length:%zu, app:%d, clid:%d", - unifyfs_keys[i]->fid, unifyfs_keys[i]->offset, - unifyfs_vals[i]->len, unifyfs_vals[i]->app_id, - unifyfs_vals[i]->rank); - - unifyfs_key_lens[i] = sizeof(unifyfs_key_t); - unifyfs_val_lens[i] = sizeof(unifyfs_val_t); - } - - ret = unifyfs_set_file_extents((int)extent_num_entries, - unifyfs_keys, unifyfs_key_lens, - unifyfs_vals, unifyfs_val_lens); - if (ret != UNIFYFS_SUCCESS) { - // TODO: need proper error handling - LOGERR("unifyfs_set_file_extents() failed"); - goto rm_cmd_fsync_exit; - } - - // file attributes - attr_num_entries = *(size_t*) - (app_config->shm_superblocks[client_side_id] - + app_config->fmeta_offset); - - /* - * file attributes are stored in the superblock shared memory - * created by the client - */ - unifyfs_file_attr_t* attr_payload = (unifyfs_file_attr_t*) - (app_config->shm_superblocks[client_side_id] - + app_config->fmeta_offset + page_sz); - - // allocate storage for values - // TODO: possibly get this from memory pool - fattr_keys = alloc_attr_key_array(attr_num_entries); - fattr_vals = calloc(attr_num_entries, sizeof(unifyfs_file_attr_t*)); - fattr_key_lens = calloc(attr_num_entries, sizeof(int)); - fattr_val_lens = calloc(attr_num_entries, sizeof(int)); - if ((NULL == fattr_keys) || - (NULL == fattr_vals) || - (NULL == fattr_key_lens) || - (NULL == fattr_val_lens)) { - ret = (int)UNIFYFS_ERROR_NOMEM; - goto rm_cmd_fsync_exit; - } - - for (i = 0; i < attr_num_entries; i++) { - *fattr_keys[i] = attr_payload[i].gfid; - fattr_vals[i] = &(attr_payload[i]); - fattr_key_lens[i] = sizeof(fattr_key_t); - fattr_val_lens[i] = sizeof(fattr_val_t); - } - - ret = unifyfs_set_file_attributes((int)attr_num_entries, - fattr_keys, fattr_key_lens, - fattr_vals, fattr_val_lens); - if (ret != UNIFYFS_SUCCESS) { - // TODO: need proper error handling - goto rm_cmd_fsync_exit; - } - -rm_cmd_fsync_exit: - // clean up memory - - if (NULL != unifyfs_keys) { - free_key_array(unifyfs_keys); - } - - if (NULL != unifyfs_vals) { - free_value_array(unifyfs_vals); - } - - if (NULL != 
unifyfs_key_lens) { - free(unifyfs_key_lens); - } - - if (NULL != unifyfs_val_lens) { - free(unifyfs_val_lens); - } - - if (NULL != fattr_keys) { - free_attr_key_array(fattr_keys); - } - - if (NULL != fattr_vals) { - free(fattr_vals); - } - - if (NULL != fattr_key_lens) { - free(fattr_key_lens); - } - - if (NULL != fattr_val_lens) { - free(fattr_val_lens); + /* wait for reqmgr thread to exit */ + int rc = pthread_join(thrd_ctrl->thrd, NULL); + if (0 == rc) { + pthread_cond_destroy(&(thrd_ctrl->thrd_cond)); + pthread_mutex_destroy(&(thrd_ctrl->thrd_lock)); + thrd_ctrl->exited = 1; } - - return ret; + return UNIFYFS_SUCCESS; } /************************ * These functions define the logic of the request manager thread ***********************/ -/* pack the the requests to be sent to the same - * delegator. - * ToDo: pack and send multiple rounds if the - * total request sizes is larger than REQ_BUF_LEN - * @param rank: source rank that sends the requests - * @param req_msg_buf: request buffer - * @param req_num: number of read requests - * @param *tot_sz: the total data size to read in these - * packed read requests - * @return success/error code */ -static int rm_pack_send_requests( - char* req_msg_buf, /* pointer to buffer to pack requests into */ - send_msg_t* send_metas, /* request objects to be packed */ - int req_cnt, /* number of requests */ - size_t* tot_sz) /* total data payload size we're requesting */ -{ - /* tot_sz records the aggregate data size - * requested in this transfer */ - - /* send format: - * (int) cmd - specifies type of message (SVC_CMD_RDREQ_MSG) - * (int) req_num - number of requests in message - * {sequence of send_meta_t requests} */ - size_t packed_size = (2 * sizeof(int)) + (req_cnt * sizeof(send_msg_t)); - - /* get pointer to start of send buffer */ - char* ptr = req_msg_buf; - memset(ptr, 0, packed_size); - - /* pack command */ - int cmd = (int)SVC_CMD_RDREQ_MSG; - *((int*)ptr) = cmd; - ptr += sizeof(int); - - /* pack request count */ - *((int*)ptr) = req_cnt; - ptr += sizeof(int); - - /* pack each request into the send buffer, - * total up incoming bytes as we go */ - int i; - size_t bytes = 0; - for (i = 0; i < req_cnt; i++) { - /* accumulate data size of this request */ - bytes += send_metas[i].length; - } - - /* copy requests into buffer */ - memcpy(ptr, send_metas, (req_cnt * sizeof(send_msg_t))); - ptr += (req_cnt * sizeof(send_msg_t)); - - /* increment running total size of data bytes */ - (*tot_sz) += bytes; - - /* return number of bytes used to pack requests */ - assert(packed_size == (ptr - req_msg_buf)); - return (int)packed_size; -} - -/* pack the chunk read requests for a single remote delegator. +/* pack the chunk read requests for a single remote server. 
* * @param req_msg_buf: request buffer used for packing * @param req_num: number of read requests * @return size of packed buffer (or error code) */ static size_t rm_pack_chunk_requests(char* req_msg_buf, - remote_chunk_reads_t* remote_reads) + server_chunk_reads_t* remote_reads) { /* send format: - * (int) cmd - specifies type of message (SVC_CMD_RDREQ_CHK) - * (int) req_cnt - number of requests in message + * (int) cmd - specifies type of message (SVC_CMD_RDREQ_CHK) + * (int) req_cnt - number of requests in message + * (size_t) total_sz - total number of bytes requested * {sequence of chunk_read_req_t} */ int req_cnt = remote_reads->num_chunks; size_t reqs_sz = req_cnt * sizeof(chunk_read_req_t); size_t packed_size = (2 * sizeof(int)) + sizeof(size_t) + reqs_sz; + assert(req_cnt <= MAX_META_PER_SEND); + /* get pointer to start of send buffer */ char* ptr = req_msg_buf; memset(ptr, 0, packed_size); @@ -1280,76 +590,13 @@ static size_t rm_pack_chunk_requests(char* req_msg_buf, return packed_size; } -/* send the read requests to the remote delegator service managers - * @param thrd_ctrl : reqmgr thread control structure - * @param tot_sz : output parameter for total size of data to read - * @return success/error code */ -static int rm_send_remote_requests(reqmgr_thrd_t* thrd_ctrl, - size_t* tot_sz) -{ - // NOTE: this fn assumes thrd_ctrl->thrd_lock is locked - - int rc; - int i = 0; - - /* ToDo: Transfer the message in multiple - * rounds when total size > the size of - * send_msg_buf - * */ - - /* use this variable to total up number of incoming data bytes */ - *tot_sz = 0; - - /* get pointer to send buffer */ - char* sendbuf = thrd_ctrl->del_req_msg_buf; - - /* get pointer to start of read request array, - * and initialize index to point to first element */ - send_msg_t* msgs = thrd_ctrl->del_req_set->msg_meta; - int msg_cursor = 0; - - /* iterate over each delegator we need to send requests to */ - for (i = 0; i < thrd_ctrl->del_req_stat->del_cnt; i++) { - /* pointer to start of requests for this delegator */ - send_msg_t* reqs = msgs + msg_cursor; - - /* number of requests for this delegator */ - int req_num = thrd_ctrl->del_req_stat->req_stat[i].req_cnt; - - /* pack requests into send buffer, get size of packed data, - * increase total number of data payload we will get back */ - int packed_size = rm_pack_send_requests(sendbuf, reqs, - req_num, tot_sz); - - /* get rank of target delegator */ - int del_rank = thrd_ctrl->del_req_stat->req_stat[i].del_id; - - /* send requests */ - //MPI_Send(sendbuf, packed_size, MPI_BYTE, - // del_rank, (int)READ_REQUEST_TAG, MPI_COMM_WORLD); - rc = invoke_server_request_rpc(del_rank, 0, (int)READ_REQUEST_TAG, - (void*)sendbuf, (size_t)packed_size); - if (rc != (int)UNIFYFS_SUCCESS) { - LOGERR("server request rpc to %d failed - %s", del_rank, - unifyfs_error_enum_str((unifyfs_error_e)rc)); - } - - /* advance to requests for next delegator */ - msg_cursor += req_num; - } - - return UNIFYFS_SUCCESS; -} - -/* send the chunk read requests to remote delegators +/* send the chunk read requests to remote servers * * @param thrd_ctrl : reqmgr thread control structure * @return success/error code */ static int rm_request_remote_chunks(reqmgr_thrd_t* thrd_ctrl) { - // NOTE: this fn assumes thrd_ctrl->thrd_lock is locked - int i, j, rc; int ret = (int)UNIFYFS_SUCCESS; @@ -1357,29 +604,30 @@ static int rm_request_remote_chunks(reqmgr_thrd_t* thrd_ctrl) char* sendbuf = thrd_ctrl->del_req_msg_buf; /* iterate over each active read request */ + RM_REQ_LOCK(thrd_ctrl); 
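/* For reference, the buffer packed by rm_pack_chunk_requests() above can be
 * walked back in the same order it was written. This is only an illustrative
 * sketch: the helper name is hypothetical, and the real unpacking is done by
 * the service manager, which this patch does not show. */
static void sketch_decode_chunk_requests(const char* buf)
{
    const char* ptr = buf;

    /* (int) cmd - should be SVC_CMD_RDREQ_CHK */
    int cmd = *(const int*)ptr;
    ptr += sizeof(int);

    /* (int) req_cnt - number of chunk_read_req_t entries that follow */
    int req_cnt = *(const int*)ptr;
    ptr += sizeof(int);

    /* (size_t) total_sz - total number of data bytes requested */
    size_t total_sz = *(const size_t*)ptr;
    ptr += sizeof(size_t);

    /* {sequence of chunk_read_req_t} */
    const chunk_read_req_t* reqs = (const chunk_read_req_t*) ptr;
    for (int i = 0; i < req_cnt; i++) {
        LOGDBG("chunk[%d]: gfid=%d offset=%zu nbytes=%zu",
               i, reqs[i].gfid,
               (size_t)reqs[i].offset, (size_t)reqs[i].nbytes);
    }

    (void)cmd; (void)total_sz;
}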
for (i = 0; i < RM_MAX_ACTIVE_REQUESTS; i++) { server_read_req_t* req = thrd_ctrl->read_reqs + i; - if (req->num_remote_reads > 0) { + if (req->num_server_reads > 0) { LOGDBG("read req %d is active", i); debug_print_read_req(req); - if (req->status == READREQ_INIT) { + if (req->status == READREQ_READY) { req->status = READREQ_STARTED; - /* iterate over each delegator we need to send requests to */ - remote_chunk_reads_t* remote_reads; + /* iterate over each server we need to send requests to */ + server_chunk_reads_t* remote_reads; size_t packed_sz; - for (j = 0; j < req->num_remote_reads; j++) { + for (j = 0; j < req->num_server_reads; j++) { remote_reads = req->remote_reads + j; remote_reads->status = READREQ_STARTED; /* pack requests into send buffer, get packed size */ packed_sz = rm_pack_chunk_requests(sendbuf, remote_reads); - /* get rank of target delegator */ + /* get rank of target server */ int del_rank = remote_reads->rank; /* send requests */ LOGDBG("[%d of %d] sending %d chunk requests to server %d", - j, req->num_remote_reads, + j, req->num_server_reads, remote_reads->num_chunks, del_rank); rc = invoke_chunk_read_request_rpc(del_rank, req, remote_reads->num_chunks, @@ -1388,20 +636,25 @@ static int rm_request_remote_chunks(reqmgr_thrd_t* thrd_ctrl) ret = rc; LOGERR("server request rpc to %d failed - %s", del_rank, - unifyfs_error_enum_str((unifyfs_error_e)rc)); + unifyfs_rc_enum_str((unifyfs_rc)rc)); } } } else { /* already started */ LOGDBG("read req %d already processed", i); } + } else if (req->num_server_reads == 0) { + if (req->status == READREQ_READY) { + req->status = READREQ_STARTED; + } } } + RM_REQ_UNLOCK(thrd_ctrl); return ret; } -/* send the chunk read requests to remote delegators +/* process chunk read responses from remote servers * * @param thrd_ctrl : reqmgr thread control structure * @return success/error code @@ -1412,24 +665,51 @@ static int rm_process_remote_chunk_responses(reqmgr_thrd_t* thrd_ctrl) int i, j, rc; int ret = (int)UNIFYFS_SUCCESS; + shm_data_header* shm_hdr; /* iterate over each active read request */ for (i = 0; i < RM_MAX_ACTIVE_REQUESTS; i++) { server_read_req_t* req = thrd_ctrl->read_reqs + i; - if ((req->num_remote_reads > 0) && - (req->status == READREQ_STARTED)) { - /* iterate over each delegator we need to send requests to */ - remote_chunk_reads_t* rcr; - for (j = 0; j < req->num_remote_reads; j++) { - rcr = req->remote_reads + j; - if (NULL == rcr->resp) { - continue; + if (req->status == READREQ_STARTED) { + if (req->num_server_reads > 0) { + /* iterate over each server we sent requests to */ + server_chunk_reads_t* scr; + for (j = 0; j < req->num_server_reads; j++) { + scr = req->remote_reads + j; + if (NULL == scr->resp) { + continue; + } + LOGDBG("found read req %d responses from server %d", + i, scr->rank); + rc = rm_handle_chunk_read_responses(thrd_ctrl, req, scr); + if (rc != (int)UNIFYFS_SUCCESS) { + LOGERR("failed to handle chunk read responses"); + ret = rc; + } + } + } else if (req->num_server_reads == 0) { + /* look up client shared memory region */ + int app_id = req->app_id; + int client_id = req->client_id; + app_client* client = get_app_client(app_id, client_id); + if (NULL != client) { + shm_context* client_shm = client->shmem_data; + assert(NULL != client_shm); + shm_hdr = (shm_data_header*) client_shm->addr; + + /* mark request as complete */ + req->status = READREQ_COMPLETE; + + /* signal client that we're now done writing data */ + client_signal(shm_hdr, SHMEM_REGION_DATA_COMPLETE); + + /* wait for client to 
read data */ + client_wait(shm_hdr); } - LOGDBG("found read req %d responses from delegator %d", - i, rcr->rank); - rc = rm_handle_chunk_read_responses(thrd_ctrl, req, rcr); + + rc = release_read_req(thrd_ctrl, req); if (rc != (int)UNIFYFS_SUCCESS) { - LOGERR("failed to handle chunk read responses"); + LOGERR("failed to release server_read_req_t"); ret = rc; } } @@ -1439,66 +719,21 @@ static int rm_process_remote_chunk_responses(reqmgr_thrd_t* thrd_ctrl) return ret; } -/* signal the client process for it to start processing read - * data in shared memory */ -static int client_signal(shm_header_t* hdr, - shm_region_state_e flag) +static shm_data_meta* reserve_shmem_meta(shm_context* shmem_data, + size_t data_sz) { - if (flag == SHMEM_REGION_DATA_READY) { - LOGDBG("setting data-ready"); - } else if (flag == SHMEM_REGION_DATA_COMPLETE) { - LOGDBG("setting data-complete"); - } - hdr->state = flag; - /* TODO: MEM_FLUSH */ - return UNIFYFS_SUCCESS; -} - -/* wait until client has processed all read data in shared memory */ -static int client_wait(shm_header_t* hdr) -{ - int rc = (int)UNIFYFS_SUCCESS; + shm_data_meta* meta = NULL; + shm_data_header* hdr = (shm_data_header*) shmem_data->addr; - /* specify time to sleep between checking flag in shared - * memory indicating client has processed data */ - struct timespec shm_wait_tm; - shm_wait_tm.tv_sec = 0; - shm_wait_tm.tv_nsec = SHM_WAIT_INTERVAL; - - /* wait for client to set flag to 0 */ - int max_sleep = 10000000; // 10s - volatile int* vip = (volatile int*)&(hdr->state); - while (*vip != SHMEM_REGION_EMPTY) { - /* not there yet, sleep for a while */ - nanosleep(&shm_wait_tm, NULL); - /* TODO: MEM_FETCH */ - max_sleep--; - if (0 == max_sleep) { - LOGERR("timed out waiting for empty"); - rc = (int)UNIFYFS_ERROR_SHMEM; - break; - } - } - - /* reset header to reflect empty state */ - hdr->meta_cnt = 0; - hdr->bytes = 0; - return rc; -} - -static shm_meta_t* reserve_shmem_meta(app_config_t* app_config, - shm_header_t* hdr, - size_t data_sz) -{ - shm_meta_t* meta = NULL; if (NULL == hdr) { LOGERR("invalid header"); } else { pthread_mutex_lock(&(hdr->sync)); - LOGDBG("shm_header(cnt=%zu, bytes=%zu)", hdr->meta_cnt, hdr->bytes); - size_t remain_size = app_config->recv_buf_sz - - (sizeof(shm_header_t) + hdr->bytes); - size_t meta_size = sizeof(shm_meta_t) + data_sz; + LOGDBG("shm_data_header(cnt=%zu, bytes=%zu)", + hdr->meta_cnt, hdr->bytes); + size_t remain_size = shmem_data->size - + (sizeof(shm_data_header) + hdr->bytes); + size_t meta_size = sizeof(shm_data_meta) + data_sz; if (meta_size > remain_size) { /* client-side receive buffer is full, * inform client to start reading */ @@ -1509,13 +744,14 @@ static shm_meta_t* reserve_shmem_meta(app_config_t* app_config, int rc = client_wait(hdr); if (rc != (int)UNIFYFS_SUCCESS) { LOGERR("wait for client recv buffer space failed"); + pthread_mutex_unlock(&(hdr->sync)); return NULL; } } size_t shm_offset = hdr->bytes; - char* shm_buf = ((char*)hdr) + sizeof(shm_header_t); - meta = (shm_meta_t*)(shm_buf + shm_offset); - LOGDBG("reserved shm_meta[%zu] and %zu payload bytes", + char* shm_buf = ((char*)hdr) + sizeof(shm_data_header); + meta = (shm_data_meta*)(shm_buf + shm_offset); + LOGDBG("reserved shm_data_meta[%zu] and %zu payload bytes", hdr->meta_cnt, data_sz); hdr->meta_cnt++; hdr->bytes += meta_size; @@ -1524,172 +760,6 @@ static shm_meta_t* reserve_shmem_meta(app_config_t* app_config, return meta; } -/* parse the read replies from message received from service manager, - * deliver replies back to 
client - * - * @param app_id : application id - * @param client_id : local client rank within app - * @param recv_msg_buf : message buffer containing packed read requests - * @param ptr_tot_sz : pointer to total processed data size - * @return success/error code */ -static int rm_process_received_msg(int app_id, - int client_id, - char* recv_msg_buf, - size_t* ptr_tot_sz) -{ - /* assume we'll succeed in processing the message */ - int rc = UNIFYFS_SUCCESS; - - /* look up client app config based on client id */ - app_config_t* app_config = - (app_config_t*)arraylist_get(app_config_list, app_id); - - /* format of read replies in shared memory - * shm_header_t - shared memory region header - * {sequence of shm_meta_t + data payload} */ - - /* get pointer to shared memory buffer for this client */ - size_t header_size = sizeof(shm_header_t); - shm_header_t* hdr = (shm_header_t*)app_config->shm_recv_bufs[client_id]; - - /* format of recv_msg_buf: - * (int) num - number of read replies packed in message - * {sequence of recv_msg_t containing read replies} */ - - /* get pointer to start of receive buffer */ - char* msgptr = recv_msg_buf; - - /* extract number of read requests in this message */ - int num = *(int*)msgptr; - msgptr += sizeof(int); - - /* unpack each read reply */ - int j; - for (j = 0; j < num; j++) { - /* point to first read reply in message */ - recv_msg_t* msg = (recv_msg_t*)msgptr; - msgptr += sizeof(recv_msg_t); - - /* get pointer in shared memory for next read reply */ - shm_meta_t* meta = reserve_shmem_meta(app_config, hdr, msg->length); - if (NULL == meta) { - LOGERR("failed to reserve space for read reply"); - rc = UNIFYFS_FAILURE; - break; - } - char* shmbuf = ((char*)meta) + sizeof(shm_meta_t); - - /* copy in header for this read request */ - meta->gfid = msg->src_fid; - meta->offset = msg->src_offset; - meta->length = msg->length; - meta->errcode = msg->errcode; - - /* copy data for this read request */ - memcpy(shmbuf, msgptr, msg->length); - - /* advance to next read reply in message buffer */ - msgptr += msg->length; - - /* decrement number of bytes processed from total */ - if (NULL != ptr_tot_sz) { - *ptr_tot_sz -= msg->length; - LOGDBG("processed message of size %zu, %zu left to receive", - msg->length, *ptr_tot_sz); - } - } - - return rc; -} - -/* receive the requested data returned from service managers - * as a result of the read requests we sent to them - * - * @param thrd_ctrl: request manager thread state - * @param tot_sz: total data size to receive (excludes header bytes) - * @return success/error code */ -static int rm_receive_remote_message(reqmgr_thrd_t* thrd_ctrl, - size_t tot_sz) -{ - // NOTE: this fn assumes thrd_ctrl->thrd_lock is locked - - /* assume we'll succeed */ - int rc = UNIFYFS_SUCCESS; - - /* get app id and client id that we'll be serving, - * app id associates thread with a namespace (mountpoint) - * the client id associates the thread with a particular - * client process id */ - int app_id = thrd_ctrl->app_id; - int client_id = thrd_ctrl->client_id; - - /* lookup our data structure for this app id */ - app_config_t* app_config = - (app_config_t*)arraylist_get(app_config_list, app_id); - - /* get thread id for this client (used for MPI tags) */ - int thrd_id = app_config->thrd_idxs[client_id]; - - /* service manager will incorporate our thread id in tag, - * to distinguish between target request manager threads */ - int tag = (int)READ_RESPONSE_TAG + thrd_id; - - /* array of MPI_Request objects for window of posted receives */ - 
MPI_Request recv_req[RECV_BUF_CNT] = {MPI_REQUEST_NULL}; - - /* get number of receives to post and size of each buffer */ - int recv_buf_cnt = RECV_BUF_CNT; - int recv_buf_len = (int) SENDRECV_BUF_LEN; - - /* post a window of receive buffers for incoming data */ - int i; - for (i = 0; i < recv_buf_cnt; i++) { - /* post buffer for incoming receive */ - MPI_Irecv(thrd_ctrl->del_recv_msg_buf[i], recv_buf_len, MPI_BYTE, - MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, &recv_req[i]); - } - - /* spin until we have received all incoming data */ - while (tot_sz > 0) { - /* wait for any receive to come in */ - int index; - MPI_Status status; - MPI_Waitany(recv_buf_cnt, recv_req, &index, &status); - - /* got a new message, get pointer to message buffer */ - char* buf = thrd_ctrl->del_recv_msg_buf[index]; - - /* unpack the data into client shared memory, - * this will internally signal client and wait - * for data to be processed if shared memory - * buffer is filled */ - int tmp_rc = rm_process_received_msg(app_id, client_id, buf, &tot_sz); - if (tmp_rc != UNIFYFS_SUCCESS) { - rc = tmp_rc; - } - - /* done processing, repost this receive buffer */ - MPI_Irecv(thrd_ctrl->del_recv_msg_buf[index], recv_buf_len, MPI_BYTE, - MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, &recv_req[index]); - } - - /* cancel posted MPI receives */ - for (i = 0; i < recv_buf_cnt; i++) { - MPI_Status status; - MPI_Cancel(&recv_req[i]); - MPI_Wait(&recv_req[i], &status); - } - - /* signal client that we're now done writing data */ - shm_header_t* hdr = (shm_header_t*)app_config->shm_recv_bufs[client_id]; - client_signal(hdr, SHMEM_REGION_DATA_COMPLETE); - - /* wait for client to read data */ - client_wait(hdr); - - return rc; -} - int rm_post_chunk_read_responses(int app_id, int client_id, int src_rank, @@ -1698,31 +768,36 @@ int rm_post_chunk_read_responses(int app_id, size_t bulk_sz, char* resp_buf) { - int rc, thrd_id; - app_config_t* app_config = NULL; - reqmgr_thrd_t* thrd_ctrl = NULL; - server_read_req_t* rdreq = NULL; - remote_chunk_reads_t* del_reads = NULL; + int rc; - /* lookup RM thread control structure for this app id */ - app_config = (app_config_t*) arraylist_get(app_config_list, app_id); - assert(NULL != app_config); - thrd_id = app_config->thrd_idxs[client_id]; - thrd_ctrl = rm_get_thread(thrd_id); + /* get application client */ + app_client* client = get_app_client(app_id, client_id); + if (NULL == client) { + return (int)UNIFYFS_FAILURE; + } + + /* get thread control structure */ + reqmgr_thrd_t* thrd_ctrl = client->reqmgr; assert(NULL != thrd_ctrl); - RM_LOCK(thrd_ctrl); + server_chunk_reads_t* del_reads = NULL; /* find read req associated with req_id */ - rdreq = thrd_ctrl->read_reqs + req_id; - for (int i = 0; i < rdreq->num_remote_reads; i++) { + if (src_rank != glb_pmi_rank) { + /* only need to lock for posting responses from remote servers. 
+ * when response is local, we already have the lock */ + RM_REQ_LOCK(thrd_ctrl); + } + server_read_req_t* rdreq = thrd_ctrl->read_reqs + req_id; + for (int i = 0; i < rdreq->num_server_reads; i++) { if (rdreq->remote_reads[i].rank == src_rank) { del_reads = rdreq->remote_reads + i; break; } } + if (NULL != del_reads) { - LOGDBG("posting chunk responses for req %d from delegator %d", + LOGDBG("posting chunk responses for req %d from server %d", req_id, src_rank); del_reads->resp = (chunk_read_resp_t*)resp_buf; if (del_reads->num_chunks != num_chks) { @@ -1735,15 +810,75 @@ int rm_post_chunk_read_responses(int app_id, LOGERR("failed to find matching chunk-reads request"); rc = (int)UNIFYFS_FAILURE; } + if (src_rank != glb_pmi_rank) { + RM_REQ_UNLOCK(thrd_ctrl); + } /* inform the request manager thread we added responses */ signal_new_responses(thrd_ctrl); - RM_UNLOCK(thrd_ctrl); - return rc; } +static int send_data_to_client(shm_context* shm, chunk_read_resp_t* resp, + char* data, size_t* bytes_processed) +{ + int ret = UNIFYFS_SUCCESS; + int errcode = 0; + size_t offset = 0; + size_t data_size = 0; + size_t bytes_left = 0; + size_t tx_size = MAX_DATA_TX_SIZE; + char* bufpos = data; + shm_data_meta* meta = NULL; + + if (resp->read_rc < 0) { + errcode = (int) -(resp->read_rc); + data_size = 0; + } else { + data_size = resp->nbytes; + } + + /* data can be larger than the shmem buffer size. split the data into + * pieces and send them */ + bytes_left = data_size; + offset = resp->offset; + + for (bytes_left = data_size; bytes_left > 0; bytes_left -= tx_size) { + if (bytes_left < tx_size) { + tx_size = bytes_left; + } + + meta = reserve_shmem_meta(shm, tx_size); + if (meta) { + meta->gfid = resp->gfid; + meta->errcode = errcode; + meta->offset = offset; + meta->length = tx_size; + + LOGDBG("sending data to client (gfid=%d, offset=%zu, length=%zu) " + "%zu bytes left", + resp->gfid, offset, tx_size, bytes_left); + + if (tx_size) { + void* shm_buf = (void*) ((char*) meta + sizeof(shm_data_meta)); + memcpy(shm_buf, bufpos, tx_size); + } + } else { + /* do we need to stop processing and exit loop here? 
*/ + LOGERR("failed to reserve shmem space for read reply"); + ret = UNIFYFS_ERROR_SHMEM; + } + + bufpos += tx_size; + offset += tx_size; + } + + *bytes_processed = data_size - bytes_left; + + return ret; +} + /* process the requested chunk data returned from service managers * * @param thrd_ctrl : request manager thread state @@ -1753,17 +888,16 @@ int rm_post_chunk_read_responses(int app_id, */ int rm_handle_chunk_read_responses(reqmgr_thrd_t* thrd_ctrl, server_read_req_t* rdreq, - remote_chunk_reads_t* del_reads) + server_chunk_reads_t* del_reads) { - int errcode, gfid, i, num_chks, rc, thrd_id; + // NOTE: this fn assumes thrd_ctrl->thrd_lock is locked + + int i, num_chks, rc; int ret = (int)UNIFYFS_SUCCESS; - app_config_t* app_config = NULL; chunk_read_resp_t* responses = NULL; - shm_header_t* client_shm = NULL; - shm_meta_t* shm_meta = NULL; - void* shm_buf = NULL; + shm_context* client_shm = NULL; + shm_data_header* shm_hdr = NULL; char* data_buf = NULL; - size_t data_sz, offset; assert((NULL != thrd_ctrl) && (NULL != rdreq) && @@ -1771,81 +905,63 @@ int rm_handle_chunk_read_responses(reqmgr_thrd_t* thrd_ctrl, (NULL != del_reads->resp)); /* look up client shared memory region */ - app_config = (app_config_t*) arraylist_get(app_config_list, rdreq->app_id); - assert(NULL != app_config); - client_shm = (shm_header_t*) app_config->shm_recv_bufs[rdreq->client_id]; - - RM_LOCK(thrd_ctrl); + app_client* clnt = get_app_client(rdreq->app_id, rdreq->client_id); + if (NULL == clnt) { + return (int)UNIFYFS_FAILURE; + } + client_shm = clnt->shmem_data; + shm_hdr = (shm_data_header*) client_shm->addr; num_chks = del_reads->num_chunks; - gfid = rdreq->extent.gfid; if (del_reads->status != READREQ_STARTED) { LOGERR("chunk read response for non-started req @ index=%d", rdreq->req_ndx); - ret = (int32_t)UNIFYFS_ERROR_INVAL; + ret = (int32_t)EINVAL; } else if (0 == del_reads->total_sz) { - LOGERR("empty chunk read response for gfid=%d", gfid); - ret = (int32_t)UNIFYFS_ERROR_INVAL; + LOGERR("empty chunk read response from server %d", del_reads->rank); + ret = (int32_t)EINVAL; } else { LOGDBG("handling chunk read responses from server %d: " - "gfid=%d num_chunks=%d buf_size=%zu", - del_reads->rank, gfid, num_chks, - del_reads->total_sz); + "num_chunks=%d buf_size=%zu", + del_reads->rank, num_chks, del_reads->total_sz); responses = del_reads->resp; data_buf = (char*)(responses + num_chks); + for (i = 0; i < num_chks; i++) { chunk_read_resp_t* resp = responses + i; - if (resp->read_rc < 0) { - errcode = (int)-(resp->read_rc); - data_sz = 0; - } else { - errcode = 0; - data_sz = resp->nbytes; - } - offset = resp->offset; - LOGDBG("chunk response for offset=%zu: sz=%zu", offset, data_sz); - - /* allocate and register local target buffer for bulk access */ - shm_meta = reserve_shmem_meta(app_config, client_shm, data_sz); - if (NULL != shm_meta) { - shm_meta->offset = offset; - shm_meta->length = data_sz; - shm_meta->gfid = gfid; - shm_meta->errcode = errcode; - shm_buf = (void*)((char*)shm_meta + sizeof(shm_meta_t)); - if (data_sz) { - memcpy(shm_buf, data_buf, data_sz); - } - } else { - LOGERR("failed to reserve shmem space for read reply") - ret = (int32_t)UNIFYFS_ERROR_SHMEM; + size_t processed = 0; + + ret = send_data_to_client(client_shm, resp, data_buf, &processed); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("failed to send data to client (ret=%d)", ret); } - data_buf += data_sz; + + data_buf += processed; } + /* cleanup */ free((void*)responses); del_reads->resp = NULL; /* update request status */ 
del_reads->status = READREQ_COMPLETE; - if (rdreq->status == READREQ_STARTED) { - rdreq->status = READREQ_PARTIAL_COMPLETE; - } + + /* if all remote reads are complete, mark the request as complete */ int completed_remote_reads = 0; - for (i = 0; i < rdreq->num_remote_reads; i++) { + for (i = 0; i < rdreq->num_server_reads; i++) { if (rdreq->remote_reads[i].status != READREQ_COMPLETE) { break; } completed_remote_reads++; } - if (completed_remote_reads == rdreq->num_remote_reads) { + if (completed_remote_reads == rdreq->num_server_reads) { rdreq->status = READREQ_COMPLETE; /* signal client that we're now done writing data */ - client_signal(client_shm, SHMEM_REGION_DATA_COMPLETE); + client_signal(shm_hdr, SHMEM_REGION_DATA_COMPLETE); /* wait for client to read data */ - client_wait(client_shm); + client_wait(shm_hdr); rc = release_read_req(thrd_ctrl, rdreq); if (rc != (int)UNIFYFS_SUCCESS) { @@ -1854,27 +970,433 @@ int rm_handle_chunk_read_responses(reqmgr_thrd_t* thrd_ctrl, } } - RM_UNLOCK(thrd_ctrl); + return ret; +} + +/* submit a client rpc request to the request manager thread */ +int rm_submit_client_rpc_request(unifyfs_fops_ctx_t* ctx, + client_rpc_req_t* req) +{ + assert((ctx != NULL) && (req != NULL)); + + /* get application client */ + app_client* client = get_app_client(ctx->app_id, ctx->client_id); + if (NULL == client) { + LOGERR("app client [%d:%d] lookup failed", + ctx->app_id, ctx->client_id); + return EINVAL; + } + + /* get thread control structure */ + reqmgr_thrd_t* reqmgr = client->reqmgr; + assert(NULL != reqmgr); + RM_REQ_LOCK(reqmgr); + arraylist_add(reqmgr->client_reqs, req); + RM_REQ_UNLOCK(reqmgr); + + signal_new_requests(reqmgr); + + return UNIFYFS_SUCCESS; +} + +static int process_filesize_rpc(reqmgr_thrd_t* reqmgr, + client_rpc_req_t* req) +{ + int ret = UNIFYFS_SUCCESS; + size_t filesize = 0; + + unifyfs_filesize_in_t* in = req->input; + assert(in != NULL); + int gfid = in->gfid; + margo_free_input(req->handle, in); + free(in); + + LOGDBG("getting filesize for gfid=%d", gfid); + + unifyfs_fops_ctx_t ctx = { + .app_id = reqmgr->app_id, + .client_id = reqmgr->client_id, + }; + ret = unifyfs_fops_filesize(&ctx, gfid, &filesize); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("unifyfs_fops_filesize() failed"); + } + + /* send rpc response */ + unifyfs_filesize_out_t out; + out.ret = (int32_t) ret; + out.filesize = filesize; + hg_return_t hret = margo_respond(req->handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* cleanup req */ + margo_destroy(req->handle); return ret; } -/* entry point for request manager thread, one thread is created - * for each client process, client informs thread of a set of read - * requests, thread retrieves data for client and notifies client - * when data is ready - * - * delegate the read requests for the delegator thread's client. Each - * delegator thread handles one connection to one client-side rank. 
- * - * @param arg: pointer to control structure for the delegator thread +static int process_fsync_rpc(reqmgr_thrd_t* reqmgr, + client_rpc_req_t* req) +{ + int ret = UNIFYFS_SUCCESS; + + unifyfs_fsync_in_t* in = req->input; + assert(in != NULL); + int gfid = in->gfid; + margo_free_input(req->handle, in); + free(in); + + LOGDBG("syncing gfid=%d", gfid); + + unifyfs_fops_ctx_t ctx = { + .app_id = reqmgr->app_id, + .client_id = reqmgr->client_id, + }; + ret = unifyfs_fops_fsync(&ctx, gfid); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("unifyfs_fops_fsync() failed"); + } + + /* send rpc response */ + unifyfs_fsync_out_t out; + out.ret = (int32_t) ret; + hg_return_t hret = margo_respond(req->handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* cleanup req */ + margo_destroy(req->handle); + + return ret; +} + +static int process_laminate_rpc(reqmgr_thrd_t* reqmgr, + client_rpc_req_t* req) +{ + int ret = UNIFYFS_SUCCESS; + + unifyfs_laminate_in_t* in = req->input; + assert(in != NULL); + int gfid = in->gfid; + margo_free_input(req->handle, in); + free(in); + + LOGDBG("laminating gfid=%d", gfid); + + unifyfs_fops_ctx_t ctx = { + .app_id = reqmgr->app_id, + .client_id = reqmgr->client_id, + }; + ret = unifyfs_fops_laminate(&ctx, gfid); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("unifyfs_fops_laminate() failed"); + } + + /* send rpc response */ + unifyfs_laminate_out_t out; + out.ret = (int32_t) ret; + hg_return_t hret = margo_respond(req->handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* cleanup req */ + margo_destroy(req->handle); + + return ret; +} + +static int process_metaget_rpc(reqmgr_thrd_t* reqmgr, + client_rpc_req_t* req) +{ + int ret = UNIFYFS_SUCCESS; + + unifyfs_metaget_in_t* in = req->input; + assert(in != NULL); + int gfid = in->gfid; + margo_free_input(req->handle, in); + free(in); + + LOGDBG("getting metadata for gfid=%d", gfid); + + unifyfs_fops_ctx_t ctx = { + .app_id = reqmgr->app_id, + .client_id = reqmgr->client_id, + }; + unifyfs_file_attr_t fattr; + memset(&fattr, 0, sizeof(fattr)); + ret = unifyfs_fops_metaget(&ctx, gfid, &fattr); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("unifyfs_fops_metaget() failed"); + } + + /* send rpc response */ + unifyfs_metaget_out_t out; + out.ret = (int32_t) ret; + out.attr = fattr; + hg_return_t hret = margo_respond(req->handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* cleanup req */ + margo_destroy(req->handle); + + return ret; +} + +static int process_metaset_rpc(reqmgr_thrd_t* reqmgr, + client_rpc_req_t* req) +{ + int ret = UNIFYFS_SUCCESS; + + unifyfs_metaset_in_t* in = req->input; + assert(in != NULL); + int gfid = in->attr.gfid; + int attr_op = (int) in->attr_op; + unifyfs_file_attr_t fattr = in->attr; + if (NULL != in->attr.filename) { + fattr.filename = strdup(in->attr.filename); + } + margo_free_input(req->handle, in); + free(in); + + LOGDBG("setting metadata for gfid=%d", gfid); + + unifyfs_fops_ctx_t ctx = { + .app_id = reqmgr->app_id, + .client_id = reqmgr->client_id, + }; + ret = unifyfs_fops_metaset(&ctx, gfid, attr_op, &fattr); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("unifyfs_fops_metaset() failed"); + } + + if (NULL != fattr.filename) { + free(fattr.filename); + } + + /* send rpc response */ + unifyfs_metaset_out_t out; + out.ret = (int32_t) ret; + hg_return_t hret = margo_respond(req->handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* cleanup req */ + margo_destroy(req->handle); 
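/* All of these client-rpc handlers share the same lifecycle. The skeleton
 * below restates that shared shape for a hypothetical gfid-only request;
 * the unifyfs_foo_* type and fops names are placeholders, not part of
 * this change. */
static int process_foo_rpc(reqmgr_thrd_t* reqmgr, client_rpc_req_t* req)
{
    /* 1. take the decoded margo input, copy out what we need, release it */
    unifyfs_foo_in_t* in = req->input;
    assert(in != NULL);
    int gfid = in->gfid;
    margo_free_input(req->handle, in);
    free(in);

    /* 2. perform the operation through the file-ops layer on behalf of
     *    the client that owns this request manager */
    unifyfs_fops_ctx_t ctx = {
        .app_id    = reqmgr->app_id,
        .client_id = reqmgr->client_id,
    };
    int ret = unifyfs_fops_foo(&ctx, gfid);
    if (ret != UNIFYFS_SUCCESS) {
        LOGERR("unifyfs_fops_foo() failed");
    }

    /* 3. respond to the blocked client rpc and release the handle */
    unifyfs_foo_out_t out;
    out.ret = (int32_t) ret;
    hg_return_t hret = margo_respond(req->handle, &out);
    if (hret != HG_SUCCESS) {
        LOGERR("margo_respond() failed");
    }
    margo_destroy(req->handle);

    return ret;
}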
+ + return ret; +} + +static int process_read_rpc(reqmgr_thrd_t* reqmgr, + client_rpc_req_t* req) +{ + int ret = UNIFYFS_SUCCESS; + + unifyfs_read_in_t* in = req->input; + assert(in != NULL); + int gfid = in->gfid; + off_t offset = in->offset; + size_t len = in->length; + margo_free_input(req->handle, in); + free(in); + + LOGDBG("reading gfid=%d (offset=%zu, length=%zu)", + gfid, (size_t)offset, len); + + unifyfs_fops_ctx_t ctx = { + .app_id = reqmgr->app_id, + .client_id = reqmgr->client_id, + }; + ret = unifyfs_fops_read(&ctx, gfid, offset, len); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("unifyfs_fops_read() failed"); + } + + /* send rpc response */ + unifyfs_read_out_t out; + out.ret = (int32_t) ret; + hg_return_t hret = margo_respond(req->handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* cleanup req */ + margo_destroy(req->handle); + + return ret; +} + +static int process_truncate_rpc(reqmgr_thrd_t* reqmgr, + client_rpc_req_t* req) +{ + int ret = UNIFYFS_SUCCESS; + + unifyfs_truncate_in_t* in = req->input; + assert(in != NULL); + int gfid = in->gfid; + size_t filesize = in->filesize; + margo_free_input(req->handle, in); + free(in); + + LOGDBG("truncating gfid=%d, sz=%zu", gfid, filesize); + + unifyfs_fops_ctx_t ctx = { + .app_id = reqmgr->app_id, + .client_id = reqmgr->client_id, + }; + ret = unifyfs_fops_truncate(&ctx, gfid, filesize); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("unifyfs_fops_truncate() failed"); + } + + /* send rpc response */ + unifyfs_truncate_out_t out; + out.ret = (int32_t) ret; + hg_return_t hret = margo_respond(req->handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* cleanup req */ + margo_destroy(req->handle); + + return ret; +} + +static int process_unlink_rpc(reqmgr_thrd_t* reqmgr, + client_rpc_req_t* req) +{ + int ret = UNIFYFS_SUCCESS; + + unifyfs_unlink_in_t* in = req->input; + assert(in != NULL); + int gfid = in->gfid; + margo_free_input(req->handle, in); + free(in); + + LOGDBG("unlinking gfid=%d", gfid); + + unifyfs_fops_ctx_t ctx = { + .app_id = reqmgr->app_id, + .client_id = reqmgr->client_id, + }; + ret = unifyfs_fops_unlink(&ctx, gfid); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("unifyfs_fops_unlink() failed"); + } + + /* send rpc response */ + unifyfs_unlink_out_t out; + out.ret = (int32_t) ret; + hg_return_t hret = margo_respond(req->handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* cleanup req */ + margo_destroy(req->handle); + + return ret; +} + + +/* iterate over list of chunk reads and send responses */ +static int rm_process_client_requests(reqmgr_thrd_t* reqmgr) +{ + /* assume we'll succeed */ + int ret = UNIFYFS_SUCCESS; + + /* this will hold a list of client requests if we find any */ + arraylist_t* client_reqs = NULL; + + /* lock to access requests */ + RM_REQ_LOCK(reqmgr); + + /* if we have any requests, take pointer to the list + * of requests and replace it with a newly allocated + * list on the request manager structure */ + int num_client_reqs = arraylist_size(reqmgr->client_reqs); + if (num_client_reqs) { + /* got some chunk read requets, take the list and replace + * it with an empty list */ + LOGDBG("processing %d client requests", num_client_reqs); + client_reqs = reqmgr->client_reqs; + reqmgr->client_reqs = arraylist_create(); + } + + /* release lock on reqmgr requests */ + RM_REQ_UNLOCK(reqmgr); + + /* iterate over each chunk read request */ + for (int i = 0; i < num_client_reqs; i++) { + /* process next request */ + 
int rret; + client_rpc_req_t* req = (client_rpc_req_t*) + arraylist_get(client_reqs, i); + switch (req->req_type) { + case UNIFYFS_CLIENT_RPC_FILESIZE: + rret = process_filesize_rpc(reqmgr, req); + break; + case UNIFYFS_CLIENT_RPC_LAMINATE: + rret = process_laminate_rpc(reqmgr, req); + break; + case UNIFYFS_CLIENT_RPC_METAGET: + rret = process_metaget_rpc(reqmgr, req); + break; + case UNIFYFS_CLIENT_RPC_METASET: + rret = process_metaset_rpc(reqmgr, req); + break; + case UNIFYFS_CLIENT_RPC_READ: + rret = process_read_rpc(reqmgr, req); + break; + case UNIFYFS_CLIENT_RPC_SYNC: + rret = process_fsync_rpc(reqmgr, req); + break; + case UNIFYFS_CLIENT_RPC_TRUNCATE: + rret = process_truncate_rpc(reqmgr, req); + break; + case UNIFYFS_CLIENT_RPC_UNLINK: + rret = process_unlink_rpc(reqmgr, req); + break; + default: + LOGERR("unsupported client rpc request type %d", req->req_type); + rret = UNIFYFS_ERROR_NYI; + break; + } + if (rret != UNIFYFS_SUCCESS) { + LOGERR("client rpc request %d failed (%s)", + i, unifyfs_rc_enum_description(rret)); + ret = rret; + } + } + + /* free the list if we have one */ + if (NULL != client_reqs) { + /* NOTE: this will call free() on each req in the arraylist */ + arraylist_free(client_reqs); + } + + return ret; +} + +/* Entry point for request manager thread. One thread is created + * for each client process to retrieve remote data and notify the + * client when data is ready. * + * @param arg: pointer to RM thread control structure * @return NULL */ -void* rm_delegate_request_thread(void* arg) +void* request_manager_thread(void* arg) { /* get pointer to our thread control structure */ reqmgr_thrd_t* thrd_ctrl = (reqmgr_thrd_t*) arg; + thrd_ctrl->tid = unifyfs_gettid(); LOGDBG("I am request manager thread!"); /* loop forever to handle read requests from the client, @@ -1885,6 +1407,12 @@ void* rm_delegate_request_thread(void* arg) /* grab lock */ RM_LOCK(thrd_ctrl); + /* process any client requests */ + rc = rm_process_client_requests(thrd_ctrl); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("failed to process client rpc requests"); + } + /* process any chunk read responses */ rc = rm_process_remote_chunk_responses(thrd_ctrl); if (rc != UNIFYFS_SUCCESS) { @@ -1893,7 +1421,7 @@ void* rm_delegate_request_thread(void* arg) /* inform dispatcher that we're waiting for work * inside the critical section */ - thrd_ctrl->has_waiting_delegator = 1; + thrd_ctrl->waiting_for_work = 1; /* if dispatcher is waiting on us, signal it to go ahead, * this coordination ensures that we'll be the next thread @@ -1901,22 +1429,23 @@ void* rm_delegate_request_thread(void* arg) * some work (rather than the dispatcher grabbing the lock * and assigning yet more work) */ if (thrd_ctrl->has_waiting_dispatcher == 1) { + /* TODO - should this be pthread_cond_broadcast() since we + * might have multiple requestors waiting? */ pthread_cond_signal(&thrd_ctrl->thrd_cond); } /* release lock and wait to be signaled by dispatcher */ - LOGDBG("RM[%d] waiting for work", thrd_ctrl->thrd_ndx); + LOGDBG("RM[%d:%d] waiting for work", + thrd_ctrl->app_id, thrd_ctrl->client_id); pthread_cond_wait(&thrd_ctrl->thrd_cond, &thrd_ctrl->thrd_lock); + LOGDBG("RM[%d:%d] got work", thrd_ctrl->app_id, thrd_ctrl->client_id); /* set flag to indicate we're no longer waiting */ - thrd_ctrl->has_waiting_delegator = 0; - - /* go do work ... 
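
The thread loop here coordinates the request manager with the margo rpc handler ULTs that feed it: the thread sets waiting_for_work before blocking on thrd_cond, and a dispatcher signals only after queueing work, so wakeups are not lost (has_waiting_dispatcher adds the reverse handshake). A self-contained illustration of that core pattern in plain pthreads; the names below are placeholders, not UnifyFS symbols.

    #include <pthread.h>

    struct workq {
        pthread_mutex_t lock;
        pthread_cond_t  cond;
        int waiting_for_work;   /* worker is blocked in cond_wait */
        int pending;            /* number of queued work items */
        int exit_flag;
    };

    /* dispatcher side: queue work, wake the worker only if it is waiting */
    static void dispatch(struct workq* q)
    {
        pthread_mutex_lock(&q->lock);
        q->pending++;
        if (q->waiting_for_work) {
            pthread_cond_signal(&q->cond);
        }
        pthread_mutex_unlock(&q->lock);
    }

    /* worker side: advertise that we are waiting, then block on the CV */
    static void* worker(void* arg)
    {
        struct workq* q = (struct workq*) arg;
        pthread_mutex_lock(&q->lock);
        while (!q->exit_flag) {
            while (q->pending == 0 && !q->exit_flag) {
                q->waiting_for_work = 1;
                pthread_cond_wait(&q->cond, &q->lock);
                q->waiting_for_work = 0;
            }
            q->pending = 0;   /* drain and process the queued work here */
        }
        pthread_mutex_unlock(&q->lock);
        return NULL;
    }
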
*/ - LOGDBG("RM[%d] got work", thrd_ctrl->thrd_ndx); + thrd_ctrl->waiting_for_work = 0; + RM_UNLOCK(thrd_ctrl); - /* release lock and bail out if we've been told to exit */ + /* bail out if we've been told to exit */ if (thrd_ctrl->exit_flag == 1) { - RM_UNLOCK(thrd_ctrl); break; } @@ -1925,28 +1454,6 @@ void* rm_delegate_request_thread(void* arg) if (rc != UNIFYFS_SUCCESS) { LOGERR("failed to request remote chunks"); } - - /* tot_sz tracks the total bytes we expect to receive. - * size is computed during send, decremented during receive */ - size_t tot_sz = 0; - rc = rm_send_remote_requests(thrd_ctrl, &tot_sz); - if (rc != UNIFYFS_SUCCESS) { - /* release lock and exit if we hit an error */ - RM_UNLOCK(thrd_ctrl); - return NULL; - } - if (tot_sz > 0) { - /* wait for data to come back from servers */ - rc = rm_receive_remote_message(thrd_ctrl, tot_sz); - if (rc != UNIFYFS_SUCCESS) { - /* release lock and exit if we hit an error */ - RM_UNLOCK(thrd_ctrl); - return NULL; - } - } - - /* release lock */ - RM_UNLOCK(thrd_ctrl); } LOGDBG("request manager thread exiting"); @@ -1956,130 +1463,15 @@ void* rm_delegate_request_thread(void* arg) /* BEGIN MARGO SERVER-SERVER RPC INVOCATION FUNCTIONS */ -/* invokes the server_hello rpc */ -int invoke_server_hello_rpc(int dst_srvr_rank) -{ - int rc = (int)UNIFYFS_SUCCESS; - hg_handle_t handle; - server_hello_in_t in; - server_hello_out_t out; - hg_return_t hret; - hg_addr_t dst_srvr_addr; - char hello_msg[UNIFYFS_MAX_HOSTNAME]; - - assert(dst_srvr_rank < (int)glb_num_servers); - dst_srvr_addr = glb_servers[dst_srvr_rank].margo_svr_addr; - - hret = margo_create(unifyfsd_rpc_context->svr_mid, dst_srvr_addr, - unifyfsd_rpc_context->rpcs.hello_id, &handle); - assert(hret == HG_SUCCESS); - - /* fill in input struct */ - snprintf(hello_msg, sizeof(hello_msg), "hello from %s", glb_host); - in.src_rank = (int32_t)glb_mpi_rank; - in.message_str = strdup(hello_msg); - - LOGDBG("invoking the server-hello rpc function"); - hret = margo_forward(handle, &in); - if (hret != HG_SUCCESS) { - rc = (int)UNIFYFS_FAILURE; - } else { - /* decode response */ - hret = margo_get_output(handle, &out); - if (hret == HG_SUCCESS) { - int32_t ret = out.ret; - LOGDBG("Got hello rpc response from %d - ret=%" PRIi32, - dst_srvr_rank, ret); - margo_free_output(handle, &out); - } else { - rc = (int)UNIFYFS_FAILURE; - } - } - - free((void*)in.message_str); - margo_destroy(handle); - - return rc; -} - -/* invokes the server_request rpc */ -int invoke_server_request_rpc(int dst_srvr_rank, int req_id, int tag, - void* data_buf, size_t buf_sz) -{ - int rc = (int)UNIFYFS_SUCCESS; - hg_handle_t handle; - server_request_in_t in; - server_request_out_t out; - hg_return_t hret; - hg_addr_t dst_srvr_addr; - hg_size_t bulk_sz = buf_sz; - - if (dst_srvr_rank == glb_mpi_rank) { - // short-circuit for local requests - if (tag == (int)READ_REQUEST_TAG) { - return sm_decode_msg((char*)data_buf); - } - } - - assert(dst_srvr_rank < (int)glb_num_servers); - dst_srvr_addr = glb_servers[dst_srvr_rank].margo_svr_addr; - - hret = margo_create(unifyfsd_rpc_context->svr_mid, dst_srvr_addr, - unifyfsd_rpc_context->rpcs.request_id, &handle); - assert(hret == HG_SUCCESS); - - /* fill in input struct */ - in.src_rank = (int32_t)glb_mpi_rank; - in.req_id = (int32_t)req_id; - in.req_tag = (int32_t)tag; - in.bulk_size = bulk_sz; - - /* register request buffer for bulk remote access */ - hret = margo_bulk_create(unifyfsd_rpc_context->svr_mid, 1, - &data_buf, &bulk_sz, - HG_BULK_READ_ONLY, &in.bulk_handle); - assert(hret == 
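
Note that the reworked invoke_chunk_read_request_rpc() below replaces the old assert(hret == HG_SUCCESS) calls with explicit checks that log the failure and return UNIFYFS_ERROR_MARGO. A hypothetical convenience macro, not part of the patch, showing one way that recurring pattern could be expressed:

    /* hypothetical: log and bail out with UNIFYFS_ERROR_MARGO on failure */
    #define CHECK_MARGO(hret, what)                                     \
        do {                                                            \
            if ((hret) != HG_SUCCESS) {                                 \
                LOGERR("%s failed (hret=%d)", (what), (int)(hret));     \
                return UNIFYFS_ERROR_MARGO;                             \
            }                                                           \
        } while (0)

    /* e.g.:
     *   hret = margo_create(mid, addr, rpc_id, &handle);
     *   CHECK_MARGO(hret, "margo_create()");
     */
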
HG_SUCCESS); - - LOGDBG("invoking the server-request rpc function"); - hret = margo_forward(handle, &in); - if (hret != HG_SUCCESS) { - rc = (int)UNIFYFS_FAILURE; - } else { - /* decode response */ - hret = margo_get_output(handle, &out); - if (hret == HG_SUCCESS) { - rc = (int)out.ret; - LOGDBG("Got request rpc response from %d - ret=%d", - dst_srvr_rank, rc); - margo_free_output(handle, &out); - } else { - rc = (int)UNIFYFS_FAILURE; - } - } - - margo_bulk_free(in.bulk_handle); - margo_destroy(handle); - - return rc; -} - /* invokes the server_request rpc */ int invoke_chunk_read_request_rpc(int dst_srvr_rank, server_read_req_t* rdreq, int num_chunks, void* data_buf, size_t buf_sz) { - int rc = (int)UNIFYFS_SUCCESS; - hg_handle_t handle; - chunk_read_request_in_t in; - chunk_read_request_out_t out; - hg_return_t hret; - hg_addr_t dst_srvr_addr; - hg_size_t bulk_sz = buf_sz; - - if (dst_srvr_rank == glb_mpi_rank) { + if (dst_srvr_rank == glb_pmi_rank) { // short-circuit for local requests - return sm_issue_chunk_reads(glb_mpi_rank, + return sm_issue_chunk_reads(glb_pmi_rank, rdreq->app_id, rdreq->client_id, rdreq->req_ndx, @@ -2087,16 +1479,27 @@ int invoke_chunk_read_request_rpc(int dst_srvr_rank, (char*)data_buf); } + int ret = UNIFYFS_SUCCESS; + hg_handle_t handle; + chunk_read_request_in_t in; + chunk_read_request_out_t out; + hg_return_t hret; + hg_addr_t dst_srvr_addr; + hg_size_t bulk_sz = buf_sz; + assert(dst_srvr_rank < (int)glb_num_servers); dst_srvr_addr = glb_servers[dst_srvr_rank].margo_svr_addr; hret = margo_create(unifyfsd_rpc_context->svr_mid, dst_srvr_addr, unifyfsd_rpc_context->rpcs.chunk_read_request_id, &handle); - assert(hret == HG_SUCCESS); + if (hret != HG_SUCCESS) { + LOGERR("margo_create() failed"); + return UNIFYFS_ERROR_MARGO; + } /* fill in input struct */ - in.src_rank = (int32_t)glb_mpi_rank; + in.src_rank = (int32_t)glb_pmi_rank; in.app_id = (int32_t)rdreq->app_id; in.client_id = (int32_t)rdreq->client_id; in.req_id = (int32_t)rdreq->req_ndx; @@ -2107,98 +1510,165 @@ int invoke_chunk_read_request_rpc(int dst_srvr_rank, hret = margo_bulk_create(unifyfsd_rpc_context->svr_mid, 1, &data_buf, &bulk_sz, HG_BULK_READ_ONLY, &in.bulk_handle); - assert(hret == HG_SUCCESS); - - LOGDBG("invoking the chunk-read-request rpc function"); - hret = margo_forward(handle, &in); if (hret != HG_SUCCESS) { - rc = (int)UNIFYFS_FAILURE; + LOGERR("margo_bulk_create() failed"); + ret = UNIFYFS_ERROR_MARGO; } else { - /* decode response */ - hret = margo_get_output(handle, &out); - if (hret == HG_SUCCESS) { - rc = (int)out.ret; - LOGDBG("Got request rpc response from %d - ret=%d", - dst_srvr_rank, rc); - margo_free_output(handle, &out); + LOGDBG("invoking the chunk-read-request rpc function"); + hret = margo_forward(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_forward() failed"); + ret = UNIFYFS_ERROR_MARGO; } else { - rc = (int)UNIFYFS_FAILURE; + /* decode response */ + hret = margo_get_output(handle, &out); + if (hret == HG_SUCCESS) { + ret = (int)out.ret; + LOGDBG("Got request rpc response from %d - ret=%d", + dst_srvr_rank, ret); + margo_free_output(handle, &out); + } else { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } } - } - margo_bulk_free(in.bulk_handle); + margo_bulk_free(in.bulk_handle); + } margo_destroy(handle); - return rc; + return ret; } + /* BEGIN MARGO SERVER-SERVER RPC HANDLER FUNCTIONS */ /* handler for remote read request response */ static void chunk_read_response_rpc(hg_handle_t handle) { - int rc, src_rank, req_id; - int 
app_id, client_id, thrd_id; - int i, num_chks; int32_t ret; - hg_return_t hret; - hg_bulk_t bulk_handle; - size_t bulk_sz; - chunk_read_response_in_t in; chunk_read_response_out_t out; - void* resp_buf = NULL; /* get input params */ - rc = margo_get_input(handle, &in); - assert(rc == HG_SUCCESS); - src_rank = (int)in.src_rank; - app_id = (int)in.app_id; - client_id = (int)in.client_id; - req_id = (int)in.req_id; - num_chks = (int)in.num_chks; - bulk_sz = (size_t)in.bulk_size; - - if (0 == bulk_sz) { - LOGERR("empty response buffer"); - ret = (int32_t)UNIFYFS_ERROR_INVAL; + chunk_read_response_in_t in; + hg_return_t hret = margo_get_input(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = (int32_t) UNIFYFS_ERROR_MARGO; } else { - resp_buf = malloc(bulk_sz); - if (NULL == resp_buf) { - LOGERR("failed to allocate chunk read responses buffer"); - ret = (int32_t)UNIFYFS_ERROR_NOMEM; + /* extract params from input struct */ + int src_rank = (int)in.src_rank; + int app_id = (int)in.app_id; + int client_id = (int)in.client_id; + int req_id = (int)in.req_id; + int num_chks = (int)in.num_chks; + size_t bulk_sz = (size_t)in.bulk_size; + + LOGDBG("received chunk read response from server %d (%d chunks)", + src_rank, num_chks); + + /* The input parameters specify the info for a bulk transfer + * buffer on the sending process. We use that info to pull data + * from the sender into a local buffer. This buffer contains + * the read reply headers and associated read data for requests + * we had sent earlier. */ + + /* pull the remote data via bulk transfer */ + if (0 == bulk_sz) { + /* sender is trying to send an empty buffer, + * don't think that should happen unless maybe + * we had sent a read request list that was empty? */ + LOGERR("empty response buffer"); + ret = (int32_t)EINVAL; } else { - /* pull response data */ - ret = (int32_t)UNIFYFS_SUCCESS; - const struct hg_info* hgi = margo_get_info(handle); - assert(NULL != hgi); - margo_instance_id mid = margo_hg_info_get_instance(hgi); - assert(mid != MARGO_INSTANCE_NULL); - hret = margo_bulk_create(mid, 1, &resp_buf, &in.bulk_size, - HG_BULK_WRITE_ONLY, &bulk_handle); - assert(hret == HG_SUCCESS); - hret = margo_bulk_transfer(mid, HG_BULK_PULL, hgi->addr, - in.bulk_handle, 0, - bulk_handle, 0, in.bulk_size); - assert(hret == HG_SUCCESS); - - rc = rm_post_chunk_read_responses(app_id, client_id, - src_rank, req_id, num_chks, - bulk_sz, (char*)resp_buf); - if (rc != (int)UNIFYFS_SUCCESS) { - LOGERR("failed to handle chunk read responses") - ret = rc; + /* allocate a buffer to hold the incoming data */ + char* resp_buf = (char*) malloc(bulk_sz); + if (NULL == resp_buf) { + /* allocation failed, that's bad */ + LOGERR("failed to allocate chunk read responses buffer"); + ret = (int32_t)ENOMEM; + } else { + /* got a buffer, now pull response data */ + ret = (int32_t)UNIFYFS_SUCCESS; + + /* get margo info */ + const struct hg_info* hgi = margo_get_info(handle); + assert(NULL != hgi); + + margo_instance_id mid = margo_hg_info_get_instance(hgi); + assert(mid != MARGO_INSTANCE_NULL); + + /* pass along address of buffer we want to transfer + * data into to prepare it for a bulk write, + * get resulting margo handle */ + hg_bulk_t bulk_handle; + hret = margo_bulk_create(mid, 1, + (void**)&resp_buf, &in.bulk_size, + HG_BULK_WRITE_ONLY, &bulk_handle); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_create() failed"); + ret = UNIFYFS_ERROR_MARGO; + goto out_respond; + } + + /* execute the transfer to pull data from remote side + * 
into our local bulk transfer buffer. + * NOTE: mercury/margo bulk transfer does not check the maximum + * transfer size that the underlying transport supports, and a + * large bulk transfer may result in failure. */ + int i = 0; + hg_size_t remain = in.bulk_size; + do { + hg_size_t offset = i * MAX_BULK_TX_SIZE; + hg_size_t len = remain < MAX_BULK_TX_SIZE + ? remain : MAX_BULK_TX_SIZE; + + hret = margo_bulk_transfer(mid, HG_BULK_PULL, hgi->addr, + in.bulk_handle, offset, + bulk_handle, offset, len); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_transfer(off=%zu, sz=%zu) failed", + (size_t)offset, (size_t)len); + ret = UNIFYFS_ERROR_MARGO; + break; + } + + remain -= len; + i++; + } while (remain > 0); + + if (hret == HG_SUCCESS) { + LOGDBG("successful bulk transfer (%zu bytes)", bulk_sz); + + /* process read replies we just received */ + int rc = rm_post_chunk_read_responses(app_id, client_id, + src_rank, req_id, + num_chks, bulk_sz, + resp_buf); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("failed to handle chunk read responses"); + ret = rc; + } + } else { + LOGERR("failed to perform bulk transfer"); + } + + /* deregister our bulk transfer buffer */ + margo_bulk_free(bulk_handle); } - margo_bulk_free(bulk_handle); } + margo_free_input(handle, &in); } - /* fill output structure and return to caller */ +out_respond: + /* return to caller */ out.ret = ret; hret = margo_respond(handle, &out); - assert(hret == HG_SUCCESS); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } /* free margo resources */ - margo_free_input(handle, &in); margo_destroy(handle); } DEFINE_MARGO_RPC_HANDLER(chunk_read_response_rpc) diff --git a/server/src/unifyfs_request_manager.h b/server/src/unifyfs_request_manager.h index 464a4a18c..215efa181 100644 --- a/server/src/unifyfs_request_manager.h +++ b/server/src/unifyfs_request_manager.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017-2019, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -31,76 +31,63 @@ #define UNIFYFS_REQUEST_MANAGER_H #include "unifyfs_global.h" +#include "unifyfs_metadata_mdhim.h" -/* one entry per delegator for which we have active read requests, - * records rank of delegator and request count */ -typedef struct { - int req_cnt; /* number of requests to this delegator */ - int del_id; /* rank of delegator */ -} per_del_stat_t; - -/* records list of delegator information (rank, req count) for - * set of delegators we have active read requests for */ typedef struct { - per_del_stat_t* req_stat; /* delegator rank and request count */ - int del_cnt; /* number of delegators we have read requests for */ -} del_req_stat_t; - -// NEW READ REQUEST STRUCTURES + client_rpc_e req_type; + hg_handle_t handle; + void* input; + void* bulk_buf; + size_t bulk_sz; +} client_rpc_req_t; typedef struct { readreq_status_e status; /* aggregate request status */ + int in_use; /* currently using this req? 
*/ int req_ndx; /* index in reqmgr read_reqs array */ int app_id; /* app id of requesting client process */ int client_id; /* client id of requesting client process */ - int num_remote_reads; /* size of remote_reads array */ - client_read_req_t extent; /* client read extent, includes gfid */ + int num_server_reads; /* size of remote_reads array */ chunk_read_req_t* chunks; /* array of chunk-reads */ - remote_chunk_reads_t* remote_reads; /* per-delegator remote reads array */ + server_chunk_reads_t* remote_reads; /* per-server remote reads array */ } server_read_req_t; -/* this structure is created by the main thread for each request - * manager thread, contains shared data structures where main thread - * issues read requests and request manager processes them, contains - * condition variable and lock for coordination between threads */ -typedef struct { - /* request manager thread */ +/* Request manager state structure - created by main thread for each request + * manager thread. Contains shared data structures for client-server and + * server-server requests and associated synchronization constructs */ +typedef struct reqmgr_thrd { + /* request manager (RM) thread */ pthread_t thrd; + pid_t tid; /* condition variable to synchronize request manager thread - * and main thread delivering work */ - pthread_cond_t thrd_cond; + * and margo rpc handler ULT delivering work */ + pthread_cond_t thrd_cond; /* lock for shared data structures (variables below) */ pthread_mutex_t thrd_lock; - /* flag indicating that request manager thread is waiting - * for work inside of critical region */ - int has_waiting_delegator; + /* flag indicating request manager thread is waiting on thrd_cond CV */ + int waiting_for_work; - /* flag indicating main thread is in critical section waiting - * for request manager thread */ + /* flag indicating a margo rpc handler ULT is waiting on thrd_cond CV */ int has_waiting_dispatcher; + /* argobots mutex for synchronizing access to request state between + * margo rpc handler ULTs and request manager thread */ + ABT_mutex reqs_sync; + + /* array of server read requests */ int num_read_reqs; int next_rdreq_ndx; server_read_req_t read_reqs[RM_MAX_ACTIVE_REQUESTS]; - /* a list of read requests to be sent to each delegator, - * main thread adds items to this list, request manager - * processes them */ - msg_meta_t* del_req_set; - - /* statistics of read requests to be sent to each delegator */ - del_req_stat_t* del_req_stat; + /* list of client rpc requests */ + arraylist_t* client_reqs; /* buffer to build read request messages */ char del_req_msg_buf[REQ_BUF_LEN]; - /* memory for posting receives for incoming read reply messages - * from the service threads */ - char del_recv_msg_buf[RECV_BUF_CNT][SENDRECV_BUF_LEN]; - /* flag set to indicate request manager thread should exit */ int exit_flag; @@ -112,44 +99,31 @@ typedef struct { /* client_id this thread is serving */ int client_id; - - /* index within rm_thrd_list */ - int thrd_ndx; } reqmgr_thrd_t; +/* reserve/release read requests */ +server_read_req_t* rm_reserve_read_req(reqmgr_thrd_t* thrd_ctrl); +int rm_release_read_req(reqmgr_thrd_t* thrd_ctrl, + server_read_req_t* rdreq); + +/* issue remote chunk read requests for extent chunks + * listed within keyvals */ +int rm_create_chunk_requests(reqmgr_thrd_t* thrd_ctrl, + server_read_req_t* rdreq, + int num_vals, + unifyfs_keyval_t* keyvals); /* create Request Manager thread for application client */ reqmgr_thrd_t* unifyfs_rm_thrd_create(int app_id, int client_id); -/* lookup 
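
The client_rpc_req_t defined above is the unit of work that margo rpc handler ULTs hand to a request manager thread; rm_process_client_requests() earlier in this diff drains the client_reqs list and frees each entry's decoded input. A hedged sketch of how a handler could package and queue one request, assuming the RM_REQ_LOCK/RM_REQ_UNLOCK macros used in that function and an arraylist_add() append helper; the patch's actual entry point for this is rm_submit_client_rpc_request(), declared below.

    /* illustrative only, not part of the patch */
    static int queue_client_rpc(reqmgr_thrd_t* reqmgr,
                                client_rpc_e type,
                                hg_handle_t handle,
                                void* decoded_input)
    {
        client_rpc_req_t* req = calloc(1, sizeof(*req));
        if (NULL == req) {
            return ENOMEM;
        }
        req->req_type = type;
        req->handle   = handle;        /* responded to by the RM thread */
        req->input    = decoded_input; /* freed by the RM thread */

        RM_REQ_LOCK(reqmgr);
        arraylist_add(reqmgr->client_reqs, req);   /* assumed append helper */
        RM_REQ_UNLOCK(reqmgr);

        /* wake the RM thread if it is blocked waiting for work */
        pthread_mutex_lock(&reqmgr->thrd_lock);
        if (reqmgr->waiting_for_work) {
            pthread_cond_signal(&reqmgr->thrd_cond);
        }
        pthread_mutex_unlock(&reqmgr->thrd_lock);

        return UNIFYFS_SUCCESS;
    }
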
Request Manager thread by index */ -reqmgr_thrd_t* rm_get_thread(int thrd_id); - /* Request Manager pthread main */ -void* rm_delegate_request_thread(void* arg); - -/* functions called by rpc handlers to assign work - * to request manager threads */ -int rm_cmd_mread(int app_id, int client_id, - size_t req_num, void* reqbuf); - -int rm_cmd_read(int app_id, int client_id, int gfid, - size_t offset, size_t length); - -int rm_cmd_filesize(int app_id, int client_id, int gfid, size_t* outsize); +void* request_manager_thread(void* arg); /* function called by main thread to instruct * resource manager thread to exit, * returns UNIFYFS_SUCCESS on success */ -int rm_cmd_exit(reqmgr_thrd_t* thrd_ctrl); - -/* - * synchronize all the indices and file attributes - * to the key-value store - * @param sock_id: the connection id in poll_set of the delegator - * @return success/error code - */ -int rm_cmd_fsync(int app_id, int client_side_id, int gfid); +int rm_request_exit(reqmgr_thrd_t* thrd_ctrl); /* update state for remote chunk reads with received response data */ int rm_post_chunk_read_responses(int app_id, @@ -163,16 +137,31 @@ int rm_post_chunk_read_responses(int app_id, /* process the requested chunk data returned from service managers */ int rm_handle_chunk_read_responses(reqmgr_thrd_t* thrd_ctrl, server_read_req_t* rdreq, - remote_chunk_reads_t* del_reads); + server_chunk_reads_t* del_reads); -/* MARGO SERVER-SERVER RPC INVOCATION FUNCTIONS */ +/** + * @brief hand over a read request to the request manager thread. + * + * @param req all members except for status and req_ndx need to be filled by + * the caller. @req->chunks and @req->remote_reads should be allocated from + * heap, and should not be freed by the caller. + * + * @return 0 on success, errno otherwise + */ +int rm_submit_read_request(server_read_req_t* req); -int invoke_server_hello_rpc(int dst_srvr_rank); +/** + * @brief submit a client rpc request to the request manager thread. + * + * @param client application client context + * @param req pointer to client rpc request struct + * + * @return UNIFYFS_SUCCESS, or error code + */ +int rm_submit_client_rpc_request(unifyfs_fops_ctx_t* ctx, + client_rpc_req_t* req); -int invoke_server_request_rpc(int dst_srvr_rank, - int req_id, - int tag, - void* data_buf, size_t buf_sz); +/* MARGO SERVER-SERVER RPC INVOCATION FUNCTIONS */ int invoke_chunk_read_request_rpc(int dst_srvr_rank, server_read_req_t* rdreq, diff --git a/server/src/unifyfs_server.c b/server/src/unifyfs_server.c new file mode 100644 index 000000000..d77bbd141 --- /dev/null +++ b/server/src/unifyfs_server.c @@ -0,0 +1,991 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +/* + * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * Copyright (c) 2017, Florida State University. Contributions from + * the Computer Architecture and Systems Research Laboratory (CASTL) + * at the Department of Computer Science. + * + * Written by: Teng Wang, Adam Moody, Weikuan Yu, Kento Sato, Kathryn Mohror + * LLNL-CODE-728877. All rights reserved. + * + * This file is part of burstfs. 
+ * For details, see https://github.com/llnl/burstfs + * Please read https://github.com/llnl/burstfs/LICENSE for full license text. + */ + +// system headers +#include +#include + +// common headers +#include "unifyfs_configurator.h" +#include "unifyfs_keyval.h" + +// server components +#include "unifyfs_global.h" +#include "unifyfs_metadata_mdhim.h" +#include "unifyfs_request_manager.h" +#include "unifyfs_service_manager.h" +#include "unifyfs_inode_tree.h" + +// margo rpcs +#include "margo_server.h" + +/* PMI information */ +int glb_pmi_rank; /* = 0 */ +int glb_pmi_size = 1; // for standalone server tests +int server_pid; + +char glb_host[UNIFYFS_MAX_HOSTNAME]; +size_t glb_host_ndx; // index of localhost in glb_servers + +size_t glb_num_servers; // size of glb_servers array +server_info_t* glb_servers; // array of server_info_t + +unifyfs_cfg_t server_cfg; + +static ABT_mutex app_configs_abt_sync; +static app_config* app_configs[MAX_NUM_APPS]; /* list of apps */ +static size_t clients_per_app = MAX_APP_CLIENTS; + +/** + * @brief create a ready status file to notify that all servers are ready for + * accepting client requests. + * + * @return 0 on success, error otherwise + */ +int unifyfs_publish_server_pids(void); + +static int unifyfs_exit(void); + +#if defined(UNIFYFS_MULTIPLE_DELEGATORS) +int* local_rank_lst; +int local_rank_cnt; + +static int CountTasksPerNode(int rank, int numTasks); +static int find_rank_idx(int my_rank); +#endif + +struct unifyfs_fops* global_fops_tab; + +/* + * Perform steps to create a daemon process: + * + * 1. Fork and exit from parent so child runs in the background + * 2. Set the daemon umask to 0 so file modes passed to open() and + * mkdir() fully control access modes + * 3. Call setsid() to create a new session and detach from controlling tty + * 4. Change current working directory to / so daemon doesn't block + * filesystem unmounts + * 5. close STDIN, STDOUT, and STDERR + * 6. 
Fork again to abdicate session leader position to guarantee + * daemon cannot reacquire a controlling TTY + * + */ +static void daemonize(void) +{ + pid_t pid; + pid_t sid; + int rc; + + pid = fork(); + + if (pid < 0) { + LOGERR("fork failed: %s", strerror(errno)); + exit(1); + } + + if (pid > 0) { + exit(0); + } + + umask(0); + + sid = setsid(); + if (sid < 0) { + LOGERR("setsid failed: %s", strerror(errno)); + exit(1); + } + + rc = chdir("/"); + if (rc < 0) { + LOGERR("chdir failed: %s", strerror(errno)); + exit(1); + } + + close(STDIN_FILENO); + close(STDOUT_FILENO); + close(STDERR_FILENO); + + pid = fork(); + if (pid < 0) { + LOGERR("fork failed: %s", strerror(errno)); + exit(1); + } else if (pid > 0) { + exit(0); + } +} + +static int time_to_exit; +void exit_request(int sig) +{ +#ifdef HAVE_STRSIGNAL + const char* sigstr = strsignal(sig); + LOGDBG("got signal %s", sigstr); +#endif + + switch (sig) { + case SIGINT: + case SIGQUIT: + case SIGTERM: + time_to_exit = 1; + LOGDBG("exit requested"); + break; + default: + LOGERR("unhandled signal %d", sig); + break; + } +} + +#if defined(UNIFYFSD_USE_MPI) +static void init_MPI(int* argc, char*** argv) +{ + int rc, provided; + rc = MPI_Init_thread(argc, argv, MPI_THREAD_MULTIPLE, &provided); + if (rc != MPI_SUCCESS) { + exit(1); + } + + rc = MPI_Comm_rank(MPI_COMM_WORLD, &glb_pmi_rank); + if (rc != MPI_SUCCESS) { + exit(1); + } + + rc = MPI_Comm_size(MPI_COMM_WORLD, &glb_pmi_size); + if (rc != MPI_SUCCESS) { + exit(1); + } +} + +static void fini_MPI(void) +{ + MPI_Finalize(); +} +#endif // UNIFYFSD_USE_MPI + +static int allocate_servers(size_t n_servers) +{ + glb_num_servers = n_servers; + glb_servers = (server_info_t*) calloc(n_servers, sizeof(server_info_t)); + if (NULL == glb_servers) { + LOGERR("failed to allocate server_info array"); + return ENOMEM; + } + return (int)UNIFYFS_SUCCESS; +} + +static int process_servers_hostfile(const char* hostfile) +{ + int rc; + size_t i, cnt; + FILE* fp = NULL; + char hostbuf[UNIFYFS_MAX_HOSTNAME+1]; + + if (NULL == hostfile) { + return EINVAL; + } + fp = fopen(hostfile, "r"); + if (!fp) { + LOGERR("failed to open hostfile %s", hostfile); + return (int)UNIFYFS_FAILURE; + } + + // scan first line: number of hosts + rc = fscanf(fp, "%zu\n", &cnt); + if (1 != rc) { + LOGERR("failed to scan hostfile host count"); + fclose(fp); + return (int)UNIFYFS_FAILURE; + } + rc = allocate_servers(cnt); + if ((int)UNIFYFS_SUCCESS != rc) { + fclose(fp); + return (int)UNIFYFS_FAILURE; + } + + // scan host lines + for (i = 0; i < cnt; i++) { + memset(hostbuf, 0, sizeof(hostbuf)); + rc = fscanf(fp, "%s\n", hostbuf); + if (1 != rc) { + LOGERR("failed to scan hostfile host line %zu", i); + fclose(fp); + return (int)UNIFYFS_FAILURE; + } + + // NOTE: following assumes one server per host + if (0 == strcmp(glb_host, hostbuf)) { + glb_host_ndx = (int)i; + LOGDBG("found myself at hostfile index=%zu, pmi_rank=%d", + glb_host_ndx, glb_pmi_rank); + } + } + fclose(fp); + + if (glb_pmi_size < cnt) { + glb_pmi_rank = (int)glb_host_ndx; + glb_pmi_size = (int)cnt; + LOGDBG("set pmi rank to host index %d", glb_pmi_rank); + } + + return (int)UNIFYFS_SUCCESS; +} + +int main(int argc, char* argv[]) +{ + int rc; + int kv_rank, kv_nranks; + bool daemon = true; + struct sigaction sa; + char rank_str[16] = {0}; + char dbg_fname[UNIFYFS_MAX_FILENAME] = {0}; + + rc = unifyfs_config_init(&server_cfg, argc, argv); + if (rc != 0) { + exit(1); + } + server_cfg.ptype = UNIFYFS_SERVER; + + // to daemon or not to daemon, that is the question + rc = 
configurator_bool_val(server_cfg.unifyfs_daemonize, &daemon); + if (rc != 0) { + exit(1); + } + if (daemon) { + daemonize(); + } + + server_pid = getpid(); + + /* unifyfs default log level is LOG_ERR */ + if (server_cfg.log_verbosity != NULL) { + long l; + rc = configurator_int_val(server_cfg.log_verbosity, &l); + if (0 == rc) { + unifyfs_set_log_level((unifyfs_log_level_t)l); + } + } + + // setup clean termination by signal + memset(&sa, 0, sizeof(struct sigaction)); + sa.sa_handler = exit_request; + rc = sigemptyset(&sa.sa_mask); + rc = sigaction(SIGINT, &sa, NULL); + rc = sigaction(SIGQUIT, &sa, NULL); + rc = sigaction(SIGTERM, &sa, NULL); + + // update clients_per_app based on configuration + if (server_cfg.server_max_app_clients != NULL) { + long l; + rc = configurator_int_val(server_cfg.server_max_app_clients, &l); + if (0 == rc) { + clients_per_app = l; + } + } + + // initialize empty app_configs[] + memset(app_configs, 0, sizeof(app_configs)); + +#if defined(UNIFYFSD_USE_MPI) + init_MPI(&argc, &argv); +#endif + + // start logging + gethostname(glb_host, sizeof(glb_host)); + snprintf(dbg_fname, sizeof(dbg_fname), "%s/%s.%s", + server_cfg.log_dir, server_cfg.log_file, glb_host); + rc = unifyfs_log_open(dbg_fname); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("%s", unifyfs_rc_enum_description((unifyfs_rc)rc)); + } + + if (NULL != server_cfg.server_hostfile) { + rc = process_servers_hostfile(server_cfg.server_hostfile); + if (rc != (int)UNIFYFS_SUCCESS) { + LOGERR("failed to gather server information"); + exit(1); + } + } + + kv_rank = glb_pmi_rank; + kv_nranks = glb_pmi_size; + rc = unifyfs_keyval_init(&server_cfg, &kv_rank, &kv_nranks); + if (rc != (int)UNIFYFS_SUCCESS) { + exit(1); + } + if (glb_pmi_rank != kv_rank) { + LOGDBG("mismatch on pmi (%d) vs kvstore (%d) rank", + glb_pmi_rank, kv_rank); + glb_pmi_rank = kv_rank; + } + if (glb_pmi_size != kv_nranks) { + LOGDBG("mismatch on pmi (%d) vs kvstore (%d) num ranks", + glb_pmi_size, kv_nranks); + glb_pmi_size = kv_nranks; + } + + snprintf(rank_str, sizeof(rank_str), "%d", glb_pmi_rank); + rc = unifyfs_keyval_publish_remote(key_unifyfsd_pmi_rank, rank_str); + if (rc != (int)UNIFYFS_SUCCESS) { + exit(1); + } + + if (NULL == server_cfg.server_hostfile) { + //glb_svr_rank = kv_rank; + rc = allocate_servers((size_t)kv_nranks); + } + + LOGDBG("initializing rpc service"); + ABT_init(argc, argv); + ABT_mutex_create(&app_configs_abt_sync); + rc = configurator_bool_val(server_cfg.margo_tcp, &margo_use_tcp); + rc = margo_server_rpc_init(); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("%s", unifyfs_rc_enum_description(rc)); + exit(1); + } + + LOGDBG("connecting rpc servers"); + rc = margo_connect_servers(); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("%s", unifyfs_rc_enum_description(rc)); + exit(1); + } + + /* launch the service manager */ + LOGDBG("launching service manager thread"); + rc = svcmgr_init(); + if (rc != (int)UNIFYFS_SUCCESS) { + LOGERR("launch failed - %s", unifyfs_rc_enum_description(rc)); + exit(1); + } + + LOGDBG("initializing file operations"); + rc = unifyfs_fops_init(&server_cfg); + if (rc != 0) { + LOGERR("%s", unifyfs_rc_enum_description(rc)); + exit(1); + } + + /* initialize our tree that maps a gfid to its extent tree */ + unifyfs_inode_tree_init(global_inode_tree); + + LOGDBG("publishing server pid"); + rc = unifyfs_publish_server_pids(); + if (rc != 0) { + LOGERR("failed to publish server pid file: %s", + unifyfs_rc_enum_description(rc)); + exit(1); + } + + LOGINFO("server[%d] - finished initialization", glb_pmi_rank); + + while 
(1) { + sleep(1); + if (time_to_exit) { + LOGDBG("starting service shutdown"); + break; + } + } + + /* tear down gfid-to-extents tree */ + unifyfs_inode_tree_destroy(global_inode_tree); + + LOGDBG("stopping service manager thread"); + rc = svcmgr_fini(); + + return unifyfs_exit(); +} + +#if defined(UNIFYFSD_USE_MPI) +#if defined(UNIFYFS_MULTIPLE_DELEGATORS) +/* count the number of delegators per node, and + * the rank of each delegator, the results are stored + * in local_rank_cnt and local_rank_lst. + * @param numTasks: number of processes in the communicator + * @return success/error code */ +static int CountTasksPerNode(int rank, int numTasks) +{ + char localhost[UNIFYFS_MAX_HOSTNAME]; + char hostname[UNIFYFS_MAX_HOSTNAME]; + int resultsLen = UNIFYFS_MAX_HOSTNAME; + + MPI_Status status; + int i, j, rc; + + if (numTasks < 0) { + return -1; + } + + rc = MPI_Get_processor_name(localhost, &resultsLen); + if (rc != 0) { + return -1; + } + + if (rank == 0) { + /* a container of (rank, host) mappings */ + name_rank_pair_t* host_set = + (name_rank_pair_t*)calloc(numTasks, sizeof(name_rank_pair_t)); + /* MPI_Recv all hostnames, and compare to local hostname */ + for (i = 1; i < numTasks; i++) { + rc = MPI_Recv(hostname, UNIFYFS_MAX_HOSTNAME, + MPI_CHAR, MPI_ANY_SOURCE, + MPI_ANY_TAG, + MPI_COMM_WORLD, &status); + if (rc != 0) { + return -1; + } + strcpy(host_set[i].hostname, hostname); + host_set[i].rank = status.MPI_SOURCE; + } + strcpy(host_set[0].hostname, localhost); + host_set[0].rank = 0; + + /* sort by hostname */ + qsort(host_set, numTasks, sizeof(name_rank_pair_t), + compare_name_rank_pair); + + /* rank_cnt: records the number of processes on each host + * rank_set: the list of ranks for each host */ + int** rank_set = (int**)calloc(numTasks, sizeof(int*)); + int* rank_cnt = (int*)calloc(numTasks, sizeof(int)); + + int cursor = 0; + int set_counter = 0; + for (i = 1; i < numTasks; i++) { + if (strcmp(host_set[i].hostname, + host_set[i - 1].hostname) != 0) { + // found a different host, so switch to a new set + int hiter, riter = 0; + rank_set[set_counter] = + (int*)calloc((i - cursor), sizeof(int)); + rank_cnt[set_counter] = i - cursor; + for (hiter = cursor; hiter < i; hiter++, riter++) { + rank_set[set_counter][riter] = host_set[hiter].rank; + } + + set_counter++; + cursor = i; + } + } + + /* fill rank_cnt and rank_set entry for the last host */ + + rank_set[set_counter] = + (int*)calloc((i - cursor), sizeof(int)); + rank_cnt[set_counter] = numTasks - cursor; + j = 0; + for (i = cursor; i < numTasks; i++, j++) { + rank_set[set_counter][j] = host_set[i].rank; + } + set_counter++; + + /* broadcast rank_set information */ + int root_set_no = -1; + for (i = 0; i < set_counter; i++) { + /* send rank set to each of its ranks */ + for (j = 0; j < rank_cnt[i]; j++) { + if (rank_set[i][j] != 0) { + rc = MPI_Send(&rank_cnt[i], 1, MPI_INT, + rank_set[i][j], 0, MPI_COMM_WORLD); + if (rc != 0) { + return -1; + } + rc = MPI_Send(rank_set[i], rank_cnt[i], MPI_INT, + rank_set[i][j], 0, MPI_COMM_WORLD); + if (rc != 0) { + return -1; + } + } else { + root_set_no = i; + local_rank_cnt = rank_cnt[i]; + local_rank_lst = (int*)calloc(rank_cnt[i], sizeof(int)); + memcpy(local_rank_lst, rank_set[i], + (local_rank_cnt * sizeof(int))) + } + } + } + + for (i = 0; i < set_counter; i++) { + free(rank_set[i]); + } + free(rank_cnt); + free(host_set); + free(rank_set); + } else { /* non-root rank */ + /* MPI_Send hostname to root */ + rc = MPI_Send(localhost, UNIFYFS_MAX_HOSTNAME, MPI_CHAR, + 0, 0, MPI_COMM_WORLD); 
+ if (rc != 0) { + return -1; + } + /* receive the local rank set count */ + rc = MPI_Recv(&local_rank_cnt, 1, MPI_INT, 0, + 0, MPI_COMM_WORLD, &status); + if (rc != 0) { + return -1; + } + /* receive the the local rank set */ + local_rank_lst = (int*)calloc(local_rank_cnt, sizeof(int)); + rc = MPI_Recv(local_rank_lst, local_rank_cnt, MPI_INT, 0, + 0, MPI_COMM_WORLD, &status); + if (rc != 0) { + free(local_rank_lst); + return -1; + } + } + + /* sort by rank */ + qsort(local_rank_lst, local_rank_cnt, sizeof(int), compare_int); + + return 0; +} + +static int find_rank_idx(int my_rank) +{ + int i; + assert(local_rank_lst != NULL); + for (i = 0; i < local_rank_cnt; i++) { + if (local_rank_lst[i] == my_rank) { + return i; + } + } + return -1; +} + +#endif // UNIFYFS_MULTIPLE_DELEGATORS +#endif // UNIFYFSD_USE_MPI + + +static int unifyfs_exit(void) +{ + int ret = UNIFYFS_SUCCESS; + + /* iterate over each active application and free resources */ + ABT_mutex_lock(app_configs_abt_sync); + for (int i = 0; i < MAX_NUM_APPS; i++) { + /* get pointer to app config for this app_id */ + app_config* app = app_configs[i]; + if (NULL != app) { + app_configs[i] = NULL; + unifyfs_rc rc = cleanup_application(app); + if (rc != UNIFYFS_SUCCESS) { + ret = rc; + } + } + } + ABT_mutex_unlock(app_configs_abt_sync); + + /* TODO: notify the service threads to exit */ + + /* finalize kvstore service*/ + LOGDBG("finalizing kvstore service"); + unifyfs_keyval_fini(); + + /* shutdown rpc service + * (note: this needs to happen after app-client cleanup above) */ + LOGDBG("stopping rpc service"); + margo_server_rpc_finalize(); + +#if defined(USE_MDHIM) + /* shutdown the metadata service*/ + LOGDBG("stopping metadata service"); + meta_sanitize(); +#endif + +#if defined(UNIFYFSD_USE_MPI) + LOGDBG("finalizing MPI"); + fini_MPI(); +#endif + + LOGDBG("all done!"); + unifyfs_log_close(); + + return ret; +} + +/* get pointer to app config for this app_id */ +app_config* get_application(int app_id) +{ + ABT_mutex_lock(app_configs_abt_sync); + for (int i = 0; i < MAX_NUM_APPS; i++) { + app_config* app_cfg = app_configs[i]; + if ((NULL != app_cfg) && (app_cfg->app_id == app_id)) { + ABT_mutex_unlock(app_configs_abt_sync); + return app_cfg; + } + } + ABT_mutex_unlock(app_configs_abt_sync); + return NULL; +} + +/* insert a new app config in app_configs[] */ +app_config* new_application(int app_id) +{ + ABT_mutex_lock(app_configs_abt_sync); + + /* don't have an app_config for this app_id, + * so allocate and fill a new one */ + app_config* new_app = (app_config*) calloc(1, sizeof(app_config)); + if (NULL == new_app) { + LOGERR("failed to allocate application structure") + ABT_mutex_unlock(app_configs_abt_sync); + return NULL; + } + + new_app->app_id = app_id; + + /* insert the given app_config in an empty slot */ + for (int i = 0; i < MAX_NUM_APPS; i++) { + app_config* existing = app_configs[i]; + if (NULL == existing) { + new_app->clients = (app_client**) calloc(clients_per_app, + sizeof(app_client*)); + if (NULL == new_app->clients) { + LOGERR("failed to allocate application clients arrays") + ABT_mutex_unlock(app_configs_abt_sync); + return NULL; + } + new_app->clients_sz = clients_per_app; + app_configs[i] = new_app; + ABT_mutex_unlock(app_configs_abt_sync); + return new_app; + } else if (existing->app_id == app_id) { + /* someone beat us to it, use existing */ + LOGDBG("found existing application for id=%d", app_id); + ABT_mutex_unlock(app_configs_abt_sync); + free(new_app); + return existing; + } + } + + 
ABT_mutex_unlock(app_configs_abt_sync); + + /* no empty slots found */ + LOGERR("insert into app_configs[] failed"); + free(new_app); + return NULL; +} + +/* free application state + * + * NOTE: the application state mutex (app_configs_abt_sync) should be locked + * before calling this function + */ +unifyfs_rc cleanup_application(app_config* app) +{ + unifyfs_rc ret = UNIFYFS_SUCCESS; + + if (NULL == app) { + return EINVAL; + } + + int app_id = app->app_id; + LOGDBG("cleaning application %d", app_id); + + /* free resources allocated for each client */ + for (int j = 0; j < app->clients_sz; j++) { + app_client* client = app->clients[j]; + if (NULL != client) { + unifyfs_rc rc = cleanup_app_client(app, client); + if (rc != UNIFYFS_SUCCESS) { + ret = rc; + } + } + } + if (NULL != app->clients) { + free(app->clients); + } + free(app); + + return ret; +} + +app_client* get_app_client(int app_id, + int client_id) +{ + /* get pointer to app structure for this app id */ + app_config* app_cfg = get_application(app_id); + if ((NULL == app_cfg) || + (client_id <= 0) || + (client_id > (int)app_cfg->clients_sz)) { + return NULL; + } + + /* clients array index is (id - 1) */ + int client_ndx = client_id - 1; + return app_cfg->clients[client_ndx]; +} + +/** + * Attach to the server-side of client shared memory regions. + * @param client: client information + * @return success|error code + */ +static unifyfs_rc attach_to_client_shmem(app_client* client, + size_t shmem_data_sz, + size_t shmem_super_sz) +{ + shm_context* shm_ctx; + char shm_name[SHMEM_NAME_LEN] = {0}; + + if (NULL == client) { + LOGERR("NULL client"); + return EINVAL; + } + + int app_id = client->app_id; + int client_id = client->client_id; + + /* initialize shmem region for client's superblock */ + sprintf(shm_name, SHMEM_SUPER_FMTSTR, app_id, client_id); + shm_ctx = unifyfs_shm_alloc(shm_name, shmem_super_sz); + if (NULL == shm_ctx) { + LOGERR("Failed to attach to shmem superblock region %s", shm_name); + return UNIFYFS_ERROR_SHMEM; + } + client->shmem_super = shm_ctx; + + /* initialize shmem region for read data */ + sprintf(shm_name, SHMEM_DATA_FMTSTR, app_id, client_id); + shm_ctx = unifyfs_shm_alloc(shm_name, shmem_data_sz); + if (NULL == shm_ctx) { + LOGERR("Failed to attach to shmem data region %s", shm_name); + return UNIFYFS_ERROR_SHMEM; + } + client->shmem_data = shm_ctx; + + /* initialize shmem header in data region */ + shm_data_header* shm_hdr = (shm_data_header*) client->shmem_data->addr; + pthread_mutex_init(&(shm_hdr->sync), NULL); + shm_hdr->meta_cnt = 0; + shm_hdr->bytes = 0; + shm_hdr->state = SHMEM_REGION_EMPTY; + + return UNIFYFS_SUCCESS; +} + +/** + * Initialize client state using passed values. + * + * Sets up logio and shmem region contexts, request manager thread, + * margo rpc address, etc. 
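
get_application(), new_application(), and get_app_client() above all walk the app_configs[] table under app_configs_abt_sync, and client ids start at 1 so the clients[] index is (id - 1). A short usage sketch of the lookup-or-create pattern a connection handler might follow; find_or_create_app() is a hypothetical caller, not part of the patch.

    /* hypothetical caller illustrating the helpers defined above */
    static app_config* find_or_create_app(int app_id)
    {
        app_config* app = get_application(app_id);
        if (NULL == app) {
            /* first client for this app id, create a new table entry */
            app = new_application(app_id);
        }
        return app;
    }

    /* and later, resolving a specific client of that app:
     *   app_client* client = get_app_client(app_id, client_id);
     *   if (NULL == client) {
     *       LOGERR("unknown client %d:%d", app_id, client_id);
     *   }
     */
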
+ */ +app_client* new_app_client(app_config* app, + const char* margo_addr_str, + const int debug_rank) +{ + if ((NULL == app) || (NULL == margo_addr_str)) { + return NULL; + } + + if (app->num_clients == app->clients_sz) { + LOGERR("reached maximum number of application clients"); + return NULL; + } + + ABT_mutex_lock(app_configs_abt_sync); + + int app_id = app->app_id; + int client_id = app->num_clients + 1; /* next client id */ + int client_ndx = client_id - 1; /* clients array index is (id - 1) */ + + app_client* client = (app_client*) calloc(1, sizeof(app_client)); + if (NULL != client) { + int failure = 0; + client->app_id = app_id; + client->client_id = client_id; + client->dbg_rank = debug_rank; + + /* convert client_addr_str to margo hg_addr_t */ + hg_return_t hret = margo_addr_lookup(unifyfsd_rpc_context->shm_mid, + margo_addr_str, + &(client->margo_addr)); + if (hret != HG_SUCCESS) { + failure = 1; + } + + /* create a request manager thread for this client */ + client->reqmgr = unifyfs_rm_thrd_create(app_id, client_id); + if (NULL == client->reqmgr) { + failure = 1; + } + + if (failure) { + LOGERR("failed to initialize application client"); + cleanup_app_client(app, client); + ABT_mutex_unlock(app_configs_abt_sync); + return NULL; + } + + /* update app state */ + app->num_clients++; + app->clients[client_ndx] = client; + } else { + LOGERR("failed to allocate client structure"); + } + + ABT_mutex_unlock(app_configs_abt_sync); + + return client; +} + +/** + * Attaches server to shared client state (e.g., logio and shmem regions) + */ +unifyfs_rc attach_app_client(app_client* client, + const char* logio_spill_dir, + const size_t logio_spill_size, + const size_t logio_shmem_size, + const size_t shmem_data_size, + const size_t shmem_super_size, + const size_t super_meta_offset, + const size_t super_meta_size) +{ + if (NULL == client) { + return EINVAL; + } + + int app_id = client->app_id; + int client_id = client->client_id; + int failure = 0; + + /* initialize server-side logio for this client */ + int rc = unifyfs_logio_init_server(app_id, client_id, + logio_shmem_size, + logio_spill_size, + logio_spill_dir, + &(client->logio)); + if (rc != UNIFYFS_SUCCESS) { + failure = 1; + } + + /* attach server-side shmem regions for this client */ + rc = attach_to_client_shmem(client, shmem_data_size, shmem_super_size); + if (rc != UNIFYFS_SUCCESS) { + failure = 1; + } + + if (failure) { + LOGERR("failed to attach application client"); + return UNIFYFS_FAILURE; + } + + client->super_meta_offset = super_meta_offset; + client->super_meta_size = super_meta_size; + client->connected = 1; + + return UNIFYFS_SUCCESS; +} + +/** + * Disconnect ephemeral client state, while maintaining access to any data + * the client wrote. + */ +unifyfs_rc disconnect_app_client(app_client* client) +{ + if (NULL == client) { + return EINVAL; + } + + if (!client->connected) { + /* already done */ + return UNIFYFS_SUCCESS; + } + + client->connected = 0; + + /* stop client request manager thread */ + if (NULL != client->reqmgr) { + rm_request_exit(client->reqmgr); + } + + /* free margo client address */ + margo_addr_free(unifyfsd_rpc_context->shm_mid, + client->margo_addr); + + /* release client shared memory regions */ + if (NULL != client->shmem_data) { + /* Release read buffer shared memory region. + * Client should have deleted file already, but will not hurt + * to do this again. 
*/ + unifyfs_shm_unlink(client->shmem_data); + unifyfs_shm_free(&(client->shmem_data)); + } + if (NULL != client->shmem_super) { + /* Release superblock shared memory region. + * Server is responsible for deleting superblock shared + * memory file that was created by the client. */ + unifyfs_shm_unlink(client->shmem_super); + unifyfs_shm_free(&(client->shmem_super)); + } + + return UNIFYFS_SUCCESS; +} + +/** + * Cleanup any client state that has been setup in preparation for + * server exit. + * + * This function may be called due to a failed initialization, so we can't + * assume any particular state is valid, other than app_id and client_id. + * + * NOTE: the application state mutex (app_configs_abt_sync) should be locked + * before calling this function + */ +unifyfs_rc cleanup_app_client(app_config* app, app_client* client) +{ + if ((NULL == app) || (NULL == client)) { + return EINVAL; + } + + LOGDBG("cleaning application client %d:%d", + client->app_id, client->client_id); + + disconnect_app_client(client); + + /* close client logio context */ + if (NULL != client->logio) { + unifyfs_logio_close(client->logio, 1); + client->logio = NULL; + } + + /* reset app->clients array index if set */ + int client_ndx = client->client_id - 1; /* client ids start at 1 */ + if (client == app->clients[client_ndx]) { + app->clients[client_ndx] = NULL; + } + + /* free client structure */ + if (NULL != client->reqmgr) { + free(client->reqmgr); + client->reqmgr = NULL; + } + free(client); + + return UNIFYFS_SUCCESS; +} diff --git a/server/src/unifyfs_server_pid.c b/server/src/unifyfs_server_pid.c new file mode 100644 index 000000000..eee856578 --- /dev/null +++ b/server/src/unifyfs_server_pid.c @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
+ */ + +#include +#include +#include +#include +#include + +#include "unifyfs_configurator.h" +#include "unifyfs_global.h" +#include "margo_server.h" +#include "unifyfs_server_rpcs.h" + +extern unifyfs_cfg_t server_cfg; + +static int* server_pids; // = NULL +static pthread_cond_t server_pid_cond = PTHREAD_COND_INITIALIZER; +static pthread_mutex_t server_pid_mutex = PTHREAD_MUTEX_INITIALIZER; +static struct timespec server_pid_timeout; + +static int alloc_server_pids(void) +{ + int ret = 0; + pthread_mutex_lock(&server_pid_mutex); + if (NULL == server_pids) { + server_pids = (int*) calloc(glb_pmi_size, sizeof(int)); + if (NULL == server_pids) { + LOGERR("failed to allocate memory (%s)", strerror(errno)); + ret = ENOMEM; + } + } + pthread_mutex_unlock(&server_pid_mutex); + return ret; +} + +static int server_pid_invoke_rpc(void) +{ + int ret = 0; + hg_return_t hret = 0; + hg_handle_t handle; + server_pid_in_t in; + server_pid_out_t out; + + in.rank = glb_pmi_rank; + in.pid = server_pid; + + hret = margo_create(unifyfsd_rpc_context->svr_mid, + glb_servers[0].margo_svr_addr, + unifyfsd_rpc_context->rpcs.server_pid_id, + &handle); + if (hret != HG_SUCCESS) { + LOGERR("failed to create rpc handle (ret=%d)", hret); + return UNIFYFS_ERROR_MARGO; + } + + hret = margo_forward(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("failed to forward rpc (ret=%d)", hret); + return UNIFYFS_ERROR_MARGO; + } + + hret = margo_get_output(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("failed to get rpc result (ret=%d)", hret); + return UNIFYFS_ERROR_MARGO; + } + + ret = out.ret; + + margo_free_output(handle, &out); + margo_destroy(handle); + + return ret; +} + +static void server_pid_rpc(hg_handle_t handle) +{ + int ret = 0; + hg_return_t hret = 0; + server_pid_in_t in; + server_pid_out_t out; + + hret = margo_get_input(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("failed to get input (ret=%d)", hret); + return; + } + + ret = alloc_server_pids(); + if (ret) { + LOGERR("failed to allocate pid array"); + return; + } + assert((int)in.rank < glb_pmi_size); + pthread_mutex_lock(&server_pid_mutex); + server_pids[in.rank] = (int) in.pid; + pthread_mutex_unlock(&server_pid_mutex); + + out.ret = 0; + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("failed to respond rpc (ret=%d)", hret); + return; + } + + margo_free_input(handle, &in); + margo_destroy(handle); + + ret = pthread_cond_signal(&server_pid_cond); + if (ret) { + LOGERR("failed to signal condition (%s)", strerror(ret)); + } +} +DEFINE_MARGO_RPC_HANDLER(server_pid_rpc); + +static inline int set_pidfile_timeout(void) +{ + int ret = 0; + long timeout_sec = 0; + + if (server_cfg.server_init_timeout) { + ret = configurator_int_val(server_cfg.server_init_timeout, + &timeout_sec); + if (ret) { + LOGERR("failed to read configuration"); + return ret; + } + } + + clock_gettime(CLOCK_REALTIME, &server_pid_timeout); + server_pid_timeout.tv_sec += timeout_sec; + + return 0; +} + +static int create_server_pid_file(void) +{ + int i = 0; + int ret = 0; + char filename[PATH_MAX] = { 0, }; + FILE* fp = NULL; + + if (!server_pids) { + LOGERR("cannot access the server pids"); + return EINVAL; + } + + sprintf(filename, "%s/%s", server_cfg.sharedfs_dir, UNIFYFSD_PID_FILENAME); + + fp = fopen(filename, "w"); + if (!fp) { + LOGERR("failed to create file %s (%s)", filename, strerror(errno)); + return errno; + } + + for (i = 0; i < glb_pmi_size; i++) { + fprintf(fp, "[%d] %d\n", i, server_pids[i]); + } + + fclose(fp); + + return ret; +} + +int 
unifyfs_publish_server_pids(void) +{ + int ret = UNIFYFS_SUCCESS; + + if (glb_pmi_rank > 0) { + /* publish my pid to server 0 */ + ret = server_pid_invoke_rpc(); + if (ret) { + LOGERR("failed to invoke pid rpc (%s)", strerror(ret)); + } + } else { + ret = alloc_server_pids(); + if (ret) { + return ret; + } + + ret = set_pidfile_timeout(); + if (ret) { + return ret; + } + + pthread_mutex_lock(&server_pid_mutex); + server_pids[0] = server_pid; + + /* keep checking count of reported servers until all have reported + * or we hit the timeout */ + do { + int count = 0; + for (int i = 0; i < glb_pmi_size; i++) { + if (server_pids[i] > 0) { + count++; + } + } + if (count == glb_pmi_size) { + ret = create_server_pid_file(); + if (UNIFYFS_SUCCESS == ret) { + LOGDBG("servers ready to accept client connections"); + } + break; + } + ret = pthread_cond_timedwait(&server_pid_cond, + &server_pid_mutex, + &server_pid_timeout); + if (ETIMEDOUT == ret) { + LOGERR("some servers failed to initialize within timeout"); + break; + } else if (ret) { + LOGERR("failed to wait on condition (err=%d, %s)", + errno, strerror(errno)); + break; + } + } while (1); + + free(server_pids); + server_pids = NULL; + + pthread_mutex_unlock(&server_pid_mutex); + } + + return ret; +} + diff --git a/server/src/unifyfs_service_manager.c b/server/src/unifyfs_service_manager.c index c3ed6689c..47955d0bd 100644 --- a/server/src/unifyfs_service_manager.c +++ b/server/src/unifyfs_service_manager.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. + * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -27,135 +27,20 @@ * Please read https://github.com/llnl/burstfs/LICENSE for full license text. */ -#include -#include -#include - #include "unifyfs_global.h" #include "unifyfs_request_manager.h" #include "unifyfs_service_manager.h" #include "unifyfs_server_rpcs.h" #include "margo_server.h" -/* The service manager thread runs in a loop waiting for incoming - * requests. When it receives a message, it unpacks all read - * requests and appends them to the sm->service_msgs list. It delays - * for some time in acting on requests in the hopes of buffering - * read bursts to make I/O more efficient. If no read request - * has come in, and the delay time out has expired, and there are - * pening read requests to service, then it services all requests. - * - * It first creates a set of read tasks based on the set of read - * requests. The read requests are sorted by source file and then - * by offset, and read requests that refer to contiguous data - * regions are merged into a large read task. Read tasks are - * added to a read_task list. - * - * The read tasks are executed to copy data into a read buffer. - * Data that is copied from shared memory simply uses memcpy(). - * Data that spans shared memory and the spillover file uses - * memcpy with pread. Data that is fully in the spillover file - * are read with async I/O (aio) operations. The aio operations - * are added to a pending_reads queue that is later waited on. - * - * After all data has been read, results are packed into send - * buffers. This is done by matching read tasks with corresponding - * read requests. When send buffers are filled with read replies - * (acks), they are sent back to requesting delegators with - * MPI_Isend calls. 
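
In unifyfs_publish_server_pids() above, every nonzero rank forwards its pid to rank 0 over rpc, while rank 0 sleeps on a condition variable with an absolute deadline and recounts the reported entries each time it wakes, writing the pid file once all glb_pmi_size servers have checked in. A self-contained illustration of that timed wait-for-all pattern in plain pthreads; the names and timeout handling here are placeholders.

    #include <errno.h>
    #include <pthread.h>
    #include <time.h>

    /* block until all n entries of reported[] are nonzero, or until
     * timeout_sec seconds have passed (mirrors the loop above) */
    static int wait_for_all(int* reported, int n,
                            pthread_mutex_t* lock, pthread_cond_t* cond,
                            long timeout_sec)
    {
        struct timespec deadline;
        clock_gettime(CLOCK_REALTIME, &deadline);
        deadline.tv_sec += timeout_sec;

        int rc = 0;
        pthread_mutex_lock(lock);
        for (;;) {
            int count = 0;
            for (int i = 0; i < n; i++) {
                count += (reported[i] > 0);
            }
            if (count == n) {
                break;                 /* everyone has checked in */
            }
            rc = pthread_cond_timedwait(cond, lock, &deadline);
            if (rc != 0) {
                break;                 /* ETIMEDOUT or another error */
            }
        }
        pthread_mutex_unlock(lock);
        return rc;
    }
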
The MPI_Request objects for those isends are - * added to a pending_sends list, which is later waited on. - * - * After replying to all sm->service_msgs, the service manager - * thread again starts listening for more incoming requests */ - -/* records info needed to build read reply to send back to - * requesting delegator, these are appened in an ack list - * which records a set of read replies before being sent */ -typedef struct { - recv_msg_t msg; /* header information for read reply */ - char* addr; /* address of data in read buffer */ -} ack_meta_t; - -/* this records info about outstanding sends to delegators, - * including the MPI request that must be waited on to - * determine when send has completed, these are added to - * an pending sends list when sent and then iterated over - * while waiting for all pending sends to complete */ -typedef struct { - MPI_Request req; /* MPI request for outstanding isend */ - MPI_Status stat; /* status for test call */ - int src_rank; /* target delegator rank */ - int src_thrd; /* target delegator thread */ -} ack_stat_t; - -/* records info about read tasks, which are generated by - * unpacking incoming read requests from remote delegators, - * it encodes information about a range of read requests - * that generated the read task, read requests that refer - * to contiguous data may be merged into a single read task, - * and the app_id and client_id are used to determine - * the memory/files holding the source data */ -typedef struct { - size_t size; /* size of read operation */ - int start_idx; /* service_msgs_t.msg[] index of starting request */ - int end_idx; /* service_msgs_t.msg[] index of ending request */ - int app_id; /* app id holding requested data */ - int cli_id; /* client id holding requested data */ - int arrival_time; /* time stamp when read request arrived */ -} read_task_t; - -/* defines an array of read tasks */ -typedef struct { - read_task_t* read_tasks; /* list of read tasks */ - int num; /* number of active read tasks */ - int cap; /* total capacity of read task list */ -} task_set_t; - -/* defines an array of read requests, generated by unpacking - * read requests from requesting delegators */ -typedef struct { - send_msg_t* msg; /* buffer of read requests */ - int num; /* number of active read requests in buffer */ - int cap; /* total capacity of read request list */ -} service_msgs_t; - -/* tracks an outstanding read operation, these are generated - * from read tasks and added to a list of pending read operations, - * which are later waited on before sending data back to requesting - * delegators */ -typedef struct { - int err_submit; /* indicates whether read was submitted */ - struct aiocb read_cb; /* structure for outstanding aio read */ - int index; /* index in read task list for this read */ - int start_pos; /* starting byte offset into read request */ - int end_pos; /* ending byte offset into read request */ - char* mem_pos; /* buffer holding read data */ -} pended_read_t; - -/* defines a list of read replies to be sent to a delegator */ -typedef struct { - arraylist_t* ack_list; /* list of read replies for delegator */ - int rank_id; /* rank of remote delegator to send to */ - int thrd_id; /* thread id of remote delegator */ - int src_cli_rank; /* rank of client that initiated read */ - size_t src_sz; /* total data size in read replies */ - int start_cursor; /* offset within ack_list */ -} rank_ack_meta_t; - -/* defines an list of reply data for different delegators, - * list is ordered by (rank,thread) of delegator 
for fast lookup */ -typedef struct { - rank_ack_meta_t* ack_metas; /* read reply data for a delegator */ - int num; /* number of items in list */ -} rank_ack_task_t; - -/* Service Manager state */ +/* Service Manager (SM) state */ typedef struct { /* the SM thread */ pthread_t thrd; - /* state synchronization mutex */ - pthread_mutex_t sync; + /* argobots mutex for synchronizing access to request state between + * margo rpc handler ULTs and SM thread */ + ABT_mutex sync; /* thread status */ int initialized; @@ -164,1082 +49,35 @@ typedef struct { /* thread return status code */ int sm_exit_rc; - /* list of chunk read requests from remote delegators */ + /* list of chunk read requests from remote servers */ arraylist_t* chunk_reads; - /* records list of read requests from requesting delegators */ - service_msgs_t service_msgs; - - /* list of read tasks that must be executed, - * generated from read requests */ - task_set_t read_task_set; - - /* list of read reply data for each delegator */ - rank_ack_task_t rank_ack_task; - - /* list of outstanding read operations */ - arraylist_t* pended_reads; - - /* list of outstanding send operations */ - arraylist_t* pended_sends; - - /* list of buffers to be used in send operations */ - arraylist_t* send_buf_list; - - /* tracks running total of bytes in current read burst */ - long burst_data_sz; - - /* buffer to hold read data while gathering it from source - * memory/files and before being packed into send buffers - * for read replies */ - char* read_buf; } svcmgr_state_t; svcmgr_state_t* sm; // = NULL +/* lock macro for debugging SM locking */ #define SM_LOCK() \ do { \ LOGDBG("locking service manager state"); \ - pthread_mutex_lock(&(sm->sync)); \ + ABT_mutex_lock(sm->sync); \ } while (0) +/* unlock macro for debugging SM locking */ #define SM_UNLOCK() \ do { \ LOGDBG("unlocking service manager state"); \ - pthread_mutex_unlock(&(sm->sync)); \ + ABT_mutex_unlock(sm->sync); \ } while (0) - -/* sort read requests into log files by - * app id, then client id, then log offset */ -static int compare_send_msg(const void* a, const void* b) -{ - const send_msg_t* ptr_a = a; - const send_msg_t* ptr_b = b; - - if (ptr_a->dest_app_id > ptr_b->dest_app_id) { - return 1; - } - - if (ptr_a->dest_app_id < ptr_b->dest_app_id) { - return -1; - } - - if (ptr_a->dest_client_id > ptr_b->dest_client_id) { - return 1; - } - - if (ptr_a->dest_client_id < ptr_b->dest_client_id) { - return -1; - } - - if (ptr_a->dest_offset > ptr_b->dest_offset) { - return 1; - } - - if (ptr_a->dest_offset < ptr_b->dest_offset) { - return -1; - } - - return 0; -} - -/* starts a new read task based on read request in sm->service_msgs - * at given index */ -static void create_msg_read_task(int index) -{ - /* get pointer to service message at given index */ - send_msg_t* msg = &sm->service_msgs.msg[index]; - - /* get pointer to current read task */ - int idx = sm->read_task_set.num; - read_task_t* read_task = &sm->read_task_set.read_tasks[idx]; - - /* copy fields from message to read task */ - read_task->start_idx = index; - read_task->end_idx = index; - read_task->size = msg->length; - read_task->app_id = msg->dest_app_id; - read_task->cli_id = msg->dest_client_id; - read_task->arrival_time = msg->arrival_time; -} - -/* - * Cluster read requests based on file offset and age. - * Each log file is uniquely identified by client-side app_id - * and client_id, so we can locate the target log file - * (generated by the client-side program). 
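/* Illustrative sketch, not part of this patch: the SM_LOCK()/SM_UNLOCK()
 * macros above wrap an Argobots mutex, and send_chunk_read_responses()
 * later in this change uses it to swap the shared chunk_reads list out
 * from under the margo handler ULTs before processing it outside the
 * lock.  The standalone code below shows that create/lock/swap/unlock/
 * free sequence with a plain void* standing in for the arraylist; the
 * names here are hypothetical. */
#include <abt.h>
#include <stddef.h>

static void* shared_list;   /* stands in for sm->chunk_reads */

static void* take_shared_list(ABT_mutex sync, void* empty_replacement)
{
    void* taken;
    ABT_mutex_lock(sync);               /* SM_LOCK()   */
    taken = shared_list;                /* grab the current list */
    shared_list = empty_replacement;    /* leave a fresh one behind */
    ABT_mutex_unlock(sync);             /* SM_UNLOCK() */
    return taken;                       /* process outside the lock */
}

static int sketch_sm_mutex(void)
{
    ABT_init(0, NULL);                  /* bring up the Argobots runtime */

    ABT_mutex sync;
    ABT_mutex_create(&sync);            /* as done in svcmgr_init() */

    void* old = take_shared_list(sync, NULL);
    (void) old;                         /* ... drain the old list ... */

    ABT_mutex_free(&sync);
    ABT_finalize();
    return 0;
}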
+/* Decode and issue chunk-reads received from request manager. + * We get a list of read requests for data on our node. Read + * data for each request and construct a set of read replies + * that will be sent back to the request manager. * - * @return success/error - */ -static int sm_cluster_reads(void) -{ - /* sort service messages by log file (app_id, client_id) - * and then by offset within each log file */ - qsort(sm->service_msgs.msg, sm->service_msgs.num, - sizeof(send_msg_t), compare_send_msg); - - /* initialize count of read tasks */ - sm->read_task_set.num = 0; - - /* create read task given first service message */ - create_msg_read_task(0); - sm->read_task_set.num++; - - /* iterate over each service message and create read tasks, - * will merge multiple read requests into read tasks - * when two requests refer to contiguous data in a log file */ - int i; - for (i = 1; i < sm->service_msgs.num; i++) { - /* get pointer to current service message */ - send_msg_t* msg = &sm->service_msgs.msg[i]; - - /* get pointer to preivous read task */ - read_task_t* last_read = - &sm->read_task_set.read_tasks[sm->read_task_set.num - 1]; - - /* check whether current message reads from the same log file, - * as our last read task */ - if ((last_read->app_id != msg->dest_app_id) || - (last_read->cli_id != msg->dest_client_id)) { - /* reading on a different local log file, - * so create a new read task for this message */ - create_msg_read_task(i); - sm->read_task_set.num++; - } else { - /* this message reads from the same log file as our last - * read request */ - - /* get pointer to previous read request */ - send_msg_t* last_msg = &sm->service_msgs.msg[i - 1]; - - /* see if we can tack current message on to - * previous read request */ - size_t last_offset = last_msg->dest_offset + last_msg->length; - if (last_offset == msg->dest_offset) { - /* current message starts where last read request - * ends, so append it to last read request if no larger - * than read_block_size */ - - /* the size of individual read should be smaller - * than read_block_size, if read size is larger it - * needs to be split into the unit of READ_BLOCK_SIZE */ - if ((last_read->size + msg->length) <= READ_BLOCK_SIZE) { - /* tack current message on previous read request */ - last_read->end_idx = i; - last_read->size += msg->length; - - /* update minimum arrival time */ - if (msg->arrival_time < last_read->arrival_time) { - last_read->arrival_time = msg->arrival_time; - } - } else { - /* if larger than read block size, start a new - * read task, here the data size requested by - * individual read request should be smaller - * than read_block_size. 
The larger one - * has already been split by the initiator */ - create_msg_read_task(i); - sm->read_task_set.num++; - } - } else { - /* not contiguous from the last offset, - * start a new read request */ - create_msg_read_task(i); - sm->read_task_set.num++; - } - } - } - - return UNIFYFS_SUCCESS; -} - -/* compare by rank and then thread in increasing order */ -static int compare_rank_thrd(int src_rank, int src_thrd, - int cmp_rank, int cmp_thrd) -{ - if (src_rank > cmp_rank) { - return 1; - } - if (src_rank < cmp_rank) { - return -1; - } - if (src_thrd > cmp_thrd) { - return 1; - } - if (src_thrd < cmp_thrd) { - return -1; - } - return 0; -} - -/* Returns index where delegator (rank, thread), - * should be in list, caller must check whether - * delegator at that position matches */ -static int find_ack_meta(int src_rank, int src_thrd, - int* found) -{ - /* assume we won't find item */ - *found = 0; - - /* if nothing in list, place this as first item */ - if (sm->rank_ack_task.num == 0) { - return 0; - } - - rank_ack_meta_t* metas = sm->rank_ack_task.ack_metas; - - /* if list has one item, compare to that item */ - if (sm->rank_ack_task.num == 1) { - /* compare to first item */ - int cmp = compare_rank_thrd(src_rank, src_thrd, - metas[0].rank_id, metas[0].thrd_id); - if (cmp < 0) { - /* item is smaller than first element */ - return 0; - } else if (cmp > 0) { - /* item is smaller than first element */ - return 1; - } else { - /* item matches first element */ - *found = 1; - return 0; - } - } - - /* execute binary search for item location in list */ - int left = 0; - int right = sm->rank_ack_task.num - 1; - int mid = (left + right) / 2; - while (right > left + 1) { - /* compare item to middle element */ - int cmp = compare_rank_thrd(src_rank, src_thrd, - metas[mid].rank_id, metas[mid].thrd_id); - if (cmp > 0) { - /* item is larger than middle item, - * so bump left range up */ - left = mid; - } else if (cmp < 0) { - /* item is smaller than middle item, - * so move right down to middle */ - right = mid; - } else { - /* found an exact match with middle element */ - *found = 1; - return mid; - } - - /* update middle */ - mid = (left + right) / 2; - } - - /* two elements left in the list, compare to left element */ - int cmp_left = compare_rank_thrd(src_rank, src_thrd, - metas[left].rank_id, - metas[left].thrd_id); - if (cmp_left < 0) { - /* item should come before left element */ - return left; - } else if (cmp_left == 0) { - /* item matches left item */ - *found = 1; - return left; - } - - /* item is larger than left element, so compare to right */ - int cmp_right = compare_rank_thrd(src_rank, src_thrd, - metas[right].rank_id, - metas[right].thrd_id); - if (cmp_right < 0) { - /* item should come before right element */ - return right; - } else if (cmp_right == 0) { - /* item matches right element */ - *found = 1; - return right; - } - - /* otherwise, item must be larger than right element */ - return (right + 1); -} - -/* Insert read reply list for specified (rank_id, thrd_id) delegator - * into ack list, keep ordered by rank,thread for fast lookup */ -static int insert_ack_meta(ack_meta_t* ack, - int pos, - int rank_id, - int thrd_id, - int src_cli_rank) -{ - /* get pointer to array of read reply data for delegators */ - rank_ack_meta_t* metas = sm->rank_ack_task.ack_metas; - - /* check whether insert location is in middle of the list */ - if (pos < sm->rank_ack_task.num) { - /* need to insert in the middle, bump all entries - * past pos up a slot */ - int i; - for (i = sm->rank_ack_task.num 
- 1; i >= pos; i--) { - metas[i + 1] = metas[i]; - } - } - - /* get pointer to ack meta data structure */ - rank_ack_meta_t* ack_meta = &(metas[pos]); - - /* initialize with values */ - ack_meta->ack_list = arraylist_create(); - ack_meta->rank_id = rank_id; - ack_meta->thrd_id = thrd_id; - ack_meta->src_cli_rank = src_cli_rank; - ack_meta->src_sz = ack->msg.length; - ack_meta->start_cursor = 0; - - /* check that we were able to create a new ack_list */ - if (ack_meta->ack_list == NULL) { - return (int)UNIFYFS_ERROR_NOMEM; - } - - /* insert ack_meta into our list */ - arraylist_add(ack_meta->ack_list, ack); - - /* increment the number of entries in our list */ - sm->rank_ack_task.num++; - - return UNIFYFS_SUCCESS; -} - -/* Send back ack to the remote delegator. - * packs read replies into a send buffer, sends data with isend, - * adds record of send to pending sends list */ -static int sm_ack_remote_delegator(rank_ack_meta_t* ack_meta) -{ - int i; - - /* allocate more send buffers if we're at capacity */ - if (sm->send_buf_list->size == sm->send_buf_list->cap) { - /* at capacity in our list, allocate more space, - * double capacity in array */ - size_t new_cap = 2 * sm->send_buf_list->cap; - sm->send_buf_list->elems = (void**)realloc(sm->send_buf_list->elems, - (new_cap * sizeof(void*))); - - /* initialize pointers in new portion of array */ - for (i = sm->send_buf_list->cap; i < new_cap; i++) { - sm->send_buf_list->elems[i] = NULL; - } - - /* record new capacity */ - sm->send_buf_list->cap = new_cap; - } - - /* attempt to reuse allocated buffer if we can */ - if (sm->send_buf_list->elems[sm->send_buf_list->size] == NULL) { - /* need to allocate a new buffer */ - sm->send_buf_list->elems[sm->send_buf_list->size] = - malloc(SENDRECV_BUF_LEN); - } - - /* get pointer to send buffer */ - char* send_msg_buf = sm->send_buf_list->elems[sm->send_buf_list->size]; - sm->send_buf_list->size++; - - /* running total number of bytes we'll send */ - int send_sz = 0; - - /* compute number of read replies we'll send */ - size_t ack_count = arraylist_size(ack_meta->ack_list); - size_t ack_start = ack_meta->start_cursor; - int len = ack_count - ack_start; - - /* copy in number of read replies to message */ - memcpy(send_msg_buf + send_sz, &len, sizeof(int)); - send_sz += sizeof(int); - - /* pack read replies into send buffer */ - for (i = ack_start; i < ack_count; i++) { - /* get pointer to read reply header */ - ack_meta_t* meta = - (ack_meta_t*)arraylist_get(ack_meta->ack_list, i); - - /* copy read reply header to send buffer */ - memcpy(send_msg_buf + send_sz, &(meta->msg), sizeof(recv_msg_t)); - send_sz += sizeof(recv_msg_t); - - /* copy file data to send buffer */ - size_t length = (size_t) meta->msg.length; - memcpy(send_msg_buf + send_sz, meta->addr, length); - send_sz += (int) length; - } - - /* get rank and thread id of remote delegator */ - int del_rank = ack_meta->rank_id; - int del_thread = ack_meta->thrd_id; - - /* allocate a new ack stat structure to track details - * of pending send */ - ack_stat_t* ack_stat = (ack_stat_t*)malloc(sizeof(ack_stat_t)); - ack_stat->src_rank = del_rank; - ack_stat->src_thrd = del_thread; - - /* send read replies to delegator (rank and thread), - * record MPI request in ack stat structure to wait later */ - MPI_Isend(send_msg_buf, send_sz, MPI_BYTE, - del_rank, (int)READ_RESPONSE_TAG + del_thread, - MPI_COMM_WORLD, &(ack_stat->req)); - - /* add item to our list of pending sends */ - arraylist_add(sm->pended_sends, ack_stat); - - return UNIFYFS_SUCCESS; -} - -/* - * 
Insert a message to an entry of ack (read reply) list corresponding - * to its destination delegator. - * - * @param mem_addr : address of data to be acked in mem_pool - * @param index : identifies msg to be inserted to ack_lst - * @param src_offset: offset of the requested segment on the logical - * file (not the physical log file on SSD). - * e.g., for N-1 pattern, logical offset - * is the offset in the shared file - * @param len : length of the message - */ -static int insert_to_ack_list(char* mem_addr, - int index, - size_t src_offset, - size_t len, - int errcode) -{ - int rc = 0; - - /* get pointer to read request we are replying to */ - send_msg_t* msg = &sm->service_msgs.msg[index]; - - /* allocate a new structure to record ack meta data */ - ack_meta_t* ack = (ack_meta_t*)malloc(sizeof(ack_meta_t)); - - /* the src_offset might start from any position in the message, so - * make it a separate parameter */ - ack->msg.src_fid = msg->src_fid; /* global file id */ - ack->msg.src_offset = src_offset; /* offset in file */ - ack->msg.length = len; /* length of data */ - ack->msg.errcode = errcode; /* error code for read (pass/fail) */ - ack->addr = mem_addr; /* pointer to data in read buffer */ - - /* after setting the ack for this message, link it - * to a ack list based on its destination. */ - int src_rank = msg->src_delegator_rank; - int src_thrd = msg->src_thrd; - int src_cli_rank = msg->src_dbg_rank; - - /* find the position in the list for target delegator */ - int found = 0; - int pos = find_ack_meta(src_rank, src_thrd, &found); - - /* check whether delegator at this position is the target */ - if (!found) { - /* it's not, so insert new entry for the target into the - * list at this position */ - rc = insert_ack_meta(ack, pos, src_rank, src_thrd, src_cli_rank); - } else { - /* found the corresponding entry for target delegator */ - rank_ack_meta_t* ack_meta = &(sm->rank_ack_task.ack_metas[pos]); - - /* compute number of read replies waiting to be sent */ - int num_entries = arraylist_size(ack_meta->ack_list); - int num_to_ack = num_entries - ack_meta->start_cursor; - - /* number of bytes needed to pack existing read replies */ - size_t curr_bytes = (num_to_ack * sizeof(ack_meta_t)) + - ack_meta->src_sz; - - /* number of bytes to pack this read reply */ - size_t bytes = sizeof(ack_meta_t) + ack->msg.length; - - /* check whether we can fit this data into the - * existing send block */ - if (curr_bytes + bytes > SENDRECV_BUF_LEN) { - /* not enough room, send the current list of read replies */ - LOGDBG("early ack due to full send buffer"); - rc = sm_ack_remote_delegator(ack_meta); - - /* start a new list */ - ack_meta->src_sz = ack->msg.length; - arraylist_add(ack_meta->ack_list, ack); - - /* start_cursor records the starting ack - * for the next send */ - ack_meta->start_cursor += num_to_ack; - - /* check that our reads and sends completed ok */ - if (rc < 0) { - return (int)UNIFYFS_ERROR_SEND; - } - } else { - /* current read reply fits in send buffer, - * add it to the list */ - ack_meta->src_sz += ack->msg.length; - arraylist_add(ack_meta->ack_list, ack); - } - } - - return UNIFYFS_SUCCESS; -} - -/* Insert the data read for each element in read task list to read - * reply list, data will be sent later - * @param read_task : data returned from read task - * @param mem_addr : memory loc to copy read data - * @param start_offset: first offset in read task - * @param end_offset : last offset in read task - * @param errcode : error code on read (pass/fail) - */ -static int 
batch_insert_to_ack_list(read_task_t* read_task, - char* mem_addr, - int start_offset, - int end_offset, - int errcode) -{ - /* search for the starting read request in service msgs for - * a given region of read_task_t identified by start_offset - * and size of read task */ - - /* read_task - * start_offset end_offset - * read_task_t: -----------******************------------ - * service_msgs:[ ],[*********],[***], [******],[ ] - * - */ - - /* find index in read requests such that it contains starting - * offset for read task */ - int idx = read_task->start_idx; - int cur_offset = 0; - while (1) { - /* get pointer to current read request */ - send_msg_t* msg = &sm->service_msgs.msg[idx]; - - /* check whether end offset of this read request comes - * at or after starting offset of read task */ - if (cur_offset + msg->length >= start_offset) { - /* starting offset of read task falls between start - * and end offset of this read request */ - break; - } - - /* move on to next read request */ - cur_offset += msg->length; - idx++; - } - - /* compute leading bytes in read request that this read task - * does not overlap with */ - int skip_offset = start_offset - cur_offset; - - /* insert read replies for leading read request and all middle - * read requests */ - int first = 1; - int mem_pos = 0; - while (1) { - /* get pointer to read request */ - send_msg_t* msg = &sm->service_msgs.msg[idx]; - - /* stop if we reached the read request that contains the last - * byte of our read task */ - if (cur_offset + msg->length >= end_offset + 1) { - /* ending byte in read task comes before ending byte - * in read request */ - break; - } - - /* compute length and offset of read request that we cover with - * this read task, assume it's the whole read request */ - long length = msg->length; - long offset = msg->src_offset; - if (first == 1) { - /* in the first read request, the read task may start - * partway through, so skip any leading bytes */ - length = msg->length - skip_offset; - offset = msg->src_offset + skip_offset; - first = 0; - } - - /* add entry to read reply list */ - insert_to_ack_list(mem_addr + mem_pos, idx, offset, length, errcode); - - /* update our offset into read reply buffer */ - mem_pos += length; - - /* update offset into read task data */ - cur_offset += length; - - /* move on to next read request */ - idx++; - } - - /* the read task may not fully fill the ending read request */ - if (mem_pos < end_offset - start_offset + 1) { - /* compute remaining length of read task */ - long length = (end_offset - start_offset + 1) - mem_pos; - - /* starting offset for read request */ - long offset = sm->service_msgs.msg[idx].src_offset; - - /* add entry to read reply list */ - insert_to_ack_list(mem_addr + mem_pos, idx, offset, length, errcode); - } - - return UNIFYFS_SUCCESS; -} - -/* - * Wait until all data are read and sent. 
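/* Illustrative sketch, not part of this patch: both the code being removed
 * in this hunk (pended_reads, pended_sends, ack lists) and the code that
 * remains (sm->chunk_reads) are built on the same arraylist_t helper.  The
 * calls below are exactly the ones used in this file (create/add/size/get/
 * reset/free); the element type and the "arraylist.h" header name are
 * assumptions, and whether reset/free also release the stored elements is
 * not shown here. */
#include <stdlib.h>
#include "arraylist.h"      /* assumed header providing arraylist_t */

static void sketch_arraylist_usage(void)
{
    arraylist_t* list = arraylist_create();
    if (NULL == list) {
        return;             /* allocation failed */
    }

    /* add a heap-allocated element, as the SM code does */
    int* item = (int*) malloc(sizeof(int));
    if (NULL != item) {
        *item = 42;
        arraylist_add(list, item);
    }

    /* walk the current contents */
    int n = arraylist_size(list);
    for (int i = 0; i < n; i++) {
        int* val = (int*) arraylist_get(list, i);
        (void) val;         /* ... use the element ... */
    }

    arraylist_reset(list);  /* empty the list, keep it allocated */
    arraylist_free(list);   /* release the list itself */
}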
- * @return success/error - */ -static int sm_wait_until_digested(void) -{ - int i, rc, counter; - read_task_t* read_task; - - /* pointer to array of boolean flags for whether we need to test - * outstanding operations */ - int* flags = NULL; - - /* get number of pending read operations */ - int num_pended_reads = arraylist_size(sm->pended_reads); - if (num_pended_reads > 0) { - /* allocate space for pending flags, setting all to 0 */ - flags = (int*) calloc(num_pended_reads, sizeof(int)); - - /* wait for all pending read operations to complete */ - counter = 0; - while (1) { - /* check whether we have processed all reads */ - if (counter == num_pended_reads) { - LOGDBG("all pending reads completed"); - break; - } - LOGDBG("waiting for %d pending reads", - (num_pended_reads - counter)); - - /* iterate over pending reads */ - for (i = 0; i < num_pended_reads; i++) { - if (flags[i] != 1) { - int errcode = UNIFYFS_SUCCESS; - - /* get meta data for this pending read */ - pended_read_t* pr = - (pended_read_t*)arraylist_get(sm->pended_reads, i); - - read_task = NULL; - if (pr->index != -1) { - read_task = &(sm->read_task_set.read_tasks[pr->index]); - } - - if (pr->err_submit) { /* failed to submit */ - /* mark that this read operation failed */ - errcode = (int)UNIFYFS_ERROR_READ; - flags[i] = 1; - counter++; - - if (NULL != read_task) { - /* add read reply to ack_list as failed */ - batch_insert_to_ack_list(read_task, - pr->mem_pos, - pr->start_pos, - pr->end_pos, - errcode); - } - } else if (aio_error(&pr->read_cb) != EINPROGRESS) { - /* mark that this read operation has completed */ - flags[i] = 1; - counter++; - - /* check that read completed without error */ - ssize_t readrc = aio_return(&pr->read_cb); - if (readrc == -1) { - errcode = errno; - } else if (readrc != (ssize_t)pr->read_cb.aio_nbytes) { - /* short read considered as error */ - errcode = (int)UNIFYFS_ERROR_READ; - } - - if (NULL != read_task) { - /* add read reply to ack_list */ - batch_insert_to_ack_list(read_task, - pr->mem_pos, - pr->start_pos, - pr->end_pos, - errcode); - } - } - } - } - } - - free(flags); - flags = NULL; - - /* reset our list of pending read operations */ - arraylist_reset(sm->pended_reads); - } - - /* read operations have completed, - * send data to delegators */ - - /* iterate over list of delegators and send remaining acks */ - for (i = 0; i < sm->rank_ack_task.num; i++) { - /* get data structure for this delegator */ - rank_ack_meta_t* ack_meta = &(sm->rank_ack_task.ack_metas[i]); - - /* get total number of read replies in this list */ - int tmp_sz = arraylist_size(ack_meta->ack_list); - - /* if we have read replies we have yet to send, - * send them now */ - if (ack_meta->start_cursor < tmp_sz) { - /* got some read replies, send them */ - rc = sm_ack_remote_delegator(ack_meta); - if (rc != UNIFYFS_SUCCESS) { - LOGERR("failed to ack delegator"); - } - } - } - - /* sends issued, now wait for them to complete */ - - /* get number of outstanding sends - * initiated in sm_ack_remote_delegator */ - int num_pended_sends = arraylist_size(sm->pended_sends); - if (num_pended_sends > 0) { - /* allocate space for flags, setting all to 0 */ - flags = (int*)calloc(num_pended_sends, sizeof(int)); - - /* wait until all acks are sent */ - counter = 0; - while (1) { - /* check whether we have waited on all sends */ - if (counter == num_pended_sends) { - LOGDBG("all pending sends completed"); - break; - } - LOGDBG("waiting for %d pending sends", - (num_pended_sends - counter)); - - /* iterate over each send we issued */ 
- for (i = 0; i < num_pended_sends; i++) { - /* if send is still outstanding, check whether it's done */ - if (flags[i] == 0) { - /* still outstanding, get data for this send */ - ack_stat_t* ack_stat = arraylist_get(sm->pended_sends, i); - - /* test whether send is complete */ - rc = MPI_Test(&ack_stat->req, &flags[i], - &ack_stat->stat); - if (rc != MPI_SUCCESS) { - LOGERR("MPI_Test() for pending send failed"); - } - - /* if send completed, bump our counter */ - if (flags[i] != 0) { - counter++; - } - } - } - } - - free(flags); - flags = NULL; - - /* clear our list of sends */ - arraylist_reset(sm->pended_sends); - } - - /* reset our list of read replies */ - for (i = 0; i < sm->rank_ack_task.num; i++) { - rank_ack_meta_t* ack_meta = &(sm->rank_ack_task.ack_metas[i]); - arraylist_reset(ack_meta->ack_list); - ack_meta->rank_id = -1; - ack_meta->thrd_id = -1; - ack_meta->src_sz = 0; - ack_meta->start_cursor = 0; - } - sm->rank_ack_task.num = 0; - - /* TODO: might be nice to free some send buffers here */ - - /* set active send buffers back to 0, - * we do not free send buffers so that we do not have to - * allocate them again */ - sm->send_buf_list->size = 0; - - return UNIFYFS_SUCCESS; -} - -static int compare_read_task(const void* a, const void* b) -{ - const read_task_t* ptr_a = a; - const read_task_t* ptr_b = b; - - if (ptr_a->size > ptr_b->size) { - return 1; - } - - if (ptr_a->size < ptr_b->size) { - return -1; - } - - if (ptr_a->arrival_time > ptr_b->arrival_time) { - return 1; - } - - if (ptr_a->arrival_time < ptr_b->arrival_time) { - return -1; - } - - return 0; -} - -/* - * Read and send the data via pipelined read, copy and send - * @return success/error - */ -static int sm_read_send_pipe(void) -{ - LOGDBG("processing %d reads", sm->read_task_set.num); - - /* sort read tasks by size and then by arrival time */ - qsort(sm->read_task_set.read_tasks, sm->read_task_set.num, - sizeof(read_task_t), compare_read_task); - - /* points to offset in read reply buffer */ - size_t buf_cursor = 0; - - /* iterate over read tasks and pack read replies into send buffer */ - int i; - for (i = 0; i < sm->read_task_set.num; i++) { - /* get pointer to current read task */ - read_task_t* read_task = &(sm->read_task_set.read_tasks[i]); - - /* get size of data we are to read */ - size_t size = read_task->size; - - /* check whether we have room in the read buffer to hold data */ - if ((buf_cursor + size) > READ_BUF_SZ) { - /* no room, wait until reads complete and send - * out replies */ - LOGDBG("read buf exhausted"); - sm_wait_until_digested(); - - /* reset to start of read buffer */ - buf_cursor = 0; - } - - /* get app id and client id for this read task, - * defines log files holding data */ - int app_id = read_task->app_id; - int cli_id = read_task->cli_id; - - /* look up app config for given app id */ - app_config_t* app_config = (app_config_t*) - arraylist_get(app_config_list, app_id); - assert(app_config); - - /* get index of starting service message */ - int start_idx = read_task->start_idx; - - /* get offset in log file */ - send_msg_t* msg = &sm->service_msgs.msg[start_idx]; - size_t offset = msg->dest_offset; - - /* prepare read opertions based on data location */ - if ((offset + read_task->size) <= app_config->data_size) { - /* requested data in read_task is totally in shared memory, - * get pointer to position in shared memory */ - char* log_ptr = app_config->shm_superblocks[cli_id] + - app_config->data_offset + offset; - - /* copy data into read reply buffer */ - char* buf_ptr = 
sm->read_buf + buf_cursor; - memcpy(buf_ptr, log_ptr, size); - buf_cursor += size; - - /* we assume memcpy worked */ - int errcode = UNIFYFS_SUCCESS; - - /* prepare read reply meta data */ - batch_insert_to_ack_list(read_task, buf_ptr, - 0, size - 1, errcode); - } else if (offset < app_config->data_size) { - /* part of the requested data is in shared memory, - * compute size in shared memory */ - size_t sz_from_mem = app_config->data_size - offset; - - /* get pointer to position in shared memory */ - char* log_ptr = app_config->shm_superblocks[cli_id] + - app_config->data_offset + offset; - - /* copy data into read reply buffer */ - char* buf_ptr = sm->read_buf + buf_cursor; - memcpy(buf_ptr, log_ptr, sz_from_mem); - buf_cursor += sz_from_mem; - - /* we assume memcpy worked */ - int errcode = UNIFYFS_SUCCESS; - - /* compute size in spillover file */ - long sz_from_ssd = size - sz_from_mem; - - /* read data from spillover file */ - int fd = app_config->spill_log_fds[cli_id]; - ssize_t nread = pread(fd, sm->read_buf + buf_cursor, - sz_from_ssd, 0); - if (nread != (ssize_t)sz_from_ssd) { - /* read error or short read, - * consider either case to be an error */ - errcode = (int)UNIFYFS_ERROR_READ; - } - buf_cursor += sz_from_ssd; - - /* prepare read reply meta data */ - batch_insert_to_ack_list(read_task, buf_ptr, - 0, size - 1, errcode); - } else { - /* all requested data in the current read task - * are in spillover files */ - int fd = app_config->spill_log_fds[cli_id]; - - /* allocate empty pending read structure */ - pended_read_t* ptr = - (pended_read_t*)malloc(sizeof(pended_read_t)); - - /* fill in aio fields */ - memset(&ptr->read_cb, 0, sizeof(struct aiocb)); - ptr->read_cb.aio_fildes = fd; - ptr->read_cb.aio_buf = sm->read_buf + buf_cursor; - ptr->read_cb.aio_offset = offset - app_config->data_size; - ptr->read_cb.aio_nbytes = size; - - /* index of read task for this pending read */ - ptr->index = i; - - /* offset locations in generating read request */ - ptr->start_pos = 0; - ptr->end_pos = size - 1; - - /* send buffer location to copy data when complete */ - ptr->mem_pos = sm->read_buf + buf_cursor; - - /* submit read as aio operation */ - ptr->err_submit = 0; - int rc = aio_read(&ptr->read_cb); - if (rc < 0) { - /* remember that we failed to submit this read */ - ptr->err_submit = 1; - } - buf_cursor += size; - - /* enqueue entry in our list of pending reads */ - arraylist_add(sm->pended_reads, ptr); - } - - /* update accounting for burst size */ - sm->burst_data_sz += size; - } - - /* have initiated all read tasks, wait for them to finish - * and send results to delegators */ - sm_wait_until_digested(); - - return UNIFYFS_SUCCESS; -} - -/* Decode the read-request message received from request_manager - * - * @param msg_buf: message buffer containing request(s) - * @return success/error code - */ -int sm_decode_msg(char* msg_buf) -{ - /* get pointer to start of receive buffer */ - char* ptr = msg_buf; - - /* advance past command */ - ptr += sizeof(int); - - /* extract number of read requests in message */ - int num = *((int*)ptr); - ptr += sizeof(int); - - /* get pointer to read request */ - send_msg_t* msg = (send_msg_t*)ptr; - - assert(NULL != sm); - SM_LOCK(); - - /* get current timestamp as integer */ - int now = (int)time(NULL); - - LOGDBG("decoding %d requests", num); - - /* allocate a larger array of service requests if needed */ - if (num + sm->service_msgs.num >= sm->service_msgs.cap) { - /* get a larger buffer (2x what is currently needed) */ - size_t count = 2 * (num + 
sm->service_msgs.num); - - /* allocate larger buffer (2x what is currently needed) */ - size_t bytes = count * sizeof(send_msg_t); - sm->service_msgs.msg = - (send_msg_t*)realloc(sm->service_msgs.msg, bytes); - if (sm->service_msgs.msg == NULL) { - /* failed to allocate memory */ - SM_UNLOCK(); - return (int)UNIFYFS_ERROR_NOMEM; - } - - /* got the memory, increase the capacity */ - sm->service_msgs.cap = count; - - /* allocate corresponding space for read tasks */ - bytes = count * sizeof(read_task_t); - sm->read_task_set.read_tasks = - (read_task_t*)realloc(sm->read_task_set.read_tasks, bytes); - if (sm->read_task_set.read_tasks == NULL) { - /* failed to allocate memory */ - SM_UNLOCK(); - return (int)UNIFYFS_ERROR_NOMEM; - } - - /* got the memory, increase the capacity */ - sm->read_task_set.cap = count; - } - - /* unpack read requests to fill in service messages */ - int iter; - for (iter = 0; iter < num; iter++) { - /* copy values from read request */ - sm->service_msgs.msg[sm->service_msgs.num] = msg[iter]; - - /* set time stamp on when we received this request */ - sm->service_msgs.msg[sm->service_msgs.num].arrival_time = now; - - /* increment the number of service requests - * and go to next read request */ - sm->service_msgs.num++; - } - - SM_UNLOCK(); - - return UNIFYFS_SUCCESS; -} - -/* Decode and issue chunk-reads received from request manager - * - * @param src_rank : source delegator rank - * @param src_app_id : app id at source delegator - * @param src_client_id : client id at source delegator - * @param src_req_id : request id at source delegator + * @param src_rank : source server rank + * @param src_app_id : app id at source server + * @param src_client_id : client id at source server + * @param src_req_id : request id at source server * @param num_chks : number of chunk requests * @param msg_buf : message buffer containing request(s) * @return success/error code @@ -1258,129 +96,130 @@ int sm_issue_chunk_reads(int src_rank, ptr += sizeof(int); /* extract number of chunk read requests */ - int num = *((int*)ptr); + assert(num_chks == *((int*)ptr)); ptr += sizeof(int); - assert(num == num_chks); + /* total data size we'll be reading */ size_t total_data_sz = *((size_t*)ptr); ptr += sizeof(size_t); - /* get pointer to read request */ + /* get pointer to read request array */ chunk_read_req_t* reqs = (chunk_read_req_t*)ptr; - remote_chunk_reads_t* rcr = (remote_chunk_reads_t*) - calloc(1, sizeof(remote_chunk_reads_t)); - if (NULL == rcr) { - LOGERR("failed to allocate remote_chunk_reads"); - return UNIFYFS_ERROR_NOMEM; - } - rcr->rank = src_rank; - rcr->app_id = src_app_id; - rcr->client_id = src_client_id; - rcr->rdreq_id = src_req_id; - rcr->num_chunks = num_chks; - rcr->reqs = NULL; + /* we'll allocate a buffer to hold a list of chunk read response + * structures, one for each chunk, followed by a data buffer + * to hold all data for all reads */ + /* compute the size of that buffer */ size_t resp_sz = sizeof(chunk_read_resp_t) * num_chks; - size_t buf_sz = resp_sz + total_data_sz; - rcr->total_sz = buf_sz; + size_t buf_sz = resp_sz + total_data_sz; + /* allocate the buffer */ // NOTE: calloc() is required here, don't use malloc char* crbuf = (char*) calloc(1, buf_sz); if (NULL == crbuf) { LOGERR("failed to allocate chunk_read_reqs"); - free(rcr); - return UNIFYFS_ERROR_NOMEM; + return ENOMEM; } - chunk_read_resp_t* resp = (chunk_read_resp_t*)crbuf; - rcr->resp = resp; + /* the chunk read response array starts as the first + * byte in our buffer and the data buffer follows 
+ * the read response array */ + chunk_read_resp_t* resp = (chunk_read_resp_t*)crbuf; char* databuf = crbuf + resp_sz; + /* allocate a struct for the chunk read request */ + server_chunk_reads_t* scr = (server_chunk_reads_t*) + calloc(1, sizeof(server_chunk_reads_t)); + if (NULL == scr) { + LOGERR("failed to allocate remote_chunk_reads"); + return ENOMEM; + } + + /* fill in chunk read request */ + scr->rank = src_rank; + scr->app_id = src_app_id; + scr->client_id = src_client_id; + scr->rdreq_id = src_req_id; + scr->num_chunks = num_chks; + scr->reqs = NULL; + scr->total_sz = buf_sz; + scr->resp = resp; + LOGDBG("issuing %d requests, total data size = %zu", num_chks, total_data_sz); - /* points to offset in read reply buffer */ + /* points to offset in read reply buffer to place + * data for next read */ size_t buf_cursor = 0; int i; - int last_app = -1; - app_config_t* app_config = NULL; + app_client* app_clnt = NULL; for (i = 0; i < num_chks; i++) { + /* pointer to next read request */ chunk_read_req_t* rreq = reqs + i; + + /* pointer to next read response */ chunk_read_resp_t* rresp = resp + i; - /* get size of data we are to read */ - size_t size = rreq->nbytes; - size_t offset = rreq->log_offset; + /* get size and log offset of data we are to read */ + size_t nbytes = rreq->nbytes; + size_t log_offset = rreq->log_offset; /* record request metadata in response */ - rresp->nbytes = size; - rresp->offset = rreq->offset; - LOGDBG("reading chunk(offset=%zu, size=%zu)", rreq->offset, size); + rresp->gfid = rreq->gfid; + rresp->read_rc = 0; + rresp->nbytes = nbytes; + rresp->offset = rreq->offset; + LOGDBG("reading chunk(offset=%zu, size=%zu)", + rreq->offset, nbytes); + + /* get pointer to next position in buffer to store read data */ + char* buf_ptr = databuf + buf_cursor; - /* get app id and client id for this read task, - * defines log files holding data */ + /* read data from client log */ int app_id = rreq->log_app_id; int cli_id = rreq->log_client_id; - if (app_id != last_app) { - /* look up app config for given app id */ - app_config = (app_config_t*) - arraylist_get(app_config_list, app_id); - assert(app_config); - last_app = app_id; - } - int spillfd = app_config->spill_log_fds[cli_id]; - char* log_ptr = app_config->shm_superblocks[cli_id] + - app_config->data_offset + offset; - - char* buf_ptr = databuf + buf_cursor; - - /* prepare read opertions based on data location */ - size_t sz_from_mem = 0; - size_t sz_from_spill = 0; - if ((offset + size) <= app_config->data_size) { - /* requested data is totally in shared memory */ - sz_from_mem = size; - } else if (offset < app_config->data_size) { - /* part of the requested data is in shared memory */ - sz_from_mem = app_config->data_size - offset; - sz_from_spill = size - sz_from_mem; - } else { - /* all requested data is in spillover file */ - sz_from_spill = size; - } - if (sz_from_mem > 0) { - /* read data from shared memory */ - memcpy(buf_ptr, log_ptr, sz_from_mem); - rresp->read_rc = sz_from_mem; - } - if (sz_from_spill > 0) { - /* read data from spillover file */ - ssize_t nread = pread(spillfd, (buf_ptr + sz_from_mem), - sz_from_spill, 0); - if (-1 == nread) { - rresp->read_rc = (ssize_t)(-errno); + app_clnt = get_app_client(app_id, cli_id); + if (NULL != app_clnt) { + logio_context* logio_ctx = app_clnt->logio; + if (NULL != logio_ctx) { + size_t nread = 0; + int rc = unifyfs_logio_read(logio_ctx, log_offset, nbytes, + buf_ptr, &nread); + if (UNIFYFS_SUCCESS == rc) { + rresp->read_rc = nread; + } else { + rresp->read_rc = 
(ssize_t)(-rc); + } } else { - rresp->read_rc += nread; + rresp->read_rc = (ssize_t)(-EINVAL); } + } else { + rresp->read_rc = (ssize_t)(-EINVAL); } - buf_cursor += size; - /* update accounting for burst size */ - sm->burst_data_sz += size; + /* update to point to next slot in read reply buffer */ + buf_cursor += nbytes; } - if (src_rank != glb_mpi_rank) { - /* add chunk_reads to svcmgr response list */ + if (src_rank != glb_pmi_rank) { + /* we need to send these read responses to another rank, + * add chunk_reads to svcmgr response list and another + * thread will take care of that */ LOGDBG("adding to svcmgr chunk_reads"); assert(NULL != sm); + SM_LOCK(); - arraylist_add(sm->chunk_reads, rcr); + arraylist_add(sm->chunk_reads, scr); SM_UNLOCK(); + + /* scr will be freed later by the sending thread */ + LOGDBG("done adding to svcmgr chunk_reads"); return UNIFYFS_SUCCESS; - } else { /* response is for myself */ + } else { + /* response is for myself, post it directly */ LOGDBG("responding to myself"); int rc = rm_post_chunk_read_responses(src_app_id, src_client_id, src_rank, src_req_id, @@ -1388,8 +227,10 @@ int sm_issue_chunk_reads(int src_rank, if (rc != (int)UNIFYFS_SUCCESS) { LOGERR("failed to handle chunk read responses"); } + /* clean up allocated buffers */ - free(rcr); + free(scr); + return rc; } } @@ -1397,100 +238,28 @@ int sm_issue_chunk_reads(int src_rank, /* initialize and launch service manager thread */ int svcmgr_init(void) { + /* allocate a service manager struct, + * store in global variable */ sm = (svcmgr_state_t*)calloc(1, sizeof(svcmgr_state_t)); if (NULL == sm) { LOGERR("failed to allocate service manager state!"); - return (int)UNIFYFS_ERROR_NOMEM; - } - - /* allocate a buffer to hold read data before packing - * into send buffers */ - sm->read_buf = malloc(READ_BUF_SZ); - if (sm->read_buf == NULL) { - LOGERR("failed to allocate service manager read buffer!"); - svcmgr_fini(); - return (int)UNIFYFS_ERROR_NOMEM; + return ENOMEM; } - /* tracks how much data we process in each burst */ - sm->burst_data_sz = 0; - /* allocate a list to track chunk read requests */ sm->chunk_reads = arraylist_create(); if (sm->chunk_reads == NULL) { LOGERR("failed to allocate service manager chunk_reads!"); svcmgr_fini(); - return (int)UNIFYFS_ERROR_NOMEM; - } - - /* initialize our data structure for holding read requests */ - size_t bytes = MAX_META_PER_SEND * sizeof(send_msg_t); - sm->service_msgs.msg = (send_msg_t*)malloc(bytes); - sm->service_msgs.num = 0; - sm->service_msgs.cap = MAX_META_PER_SEND; - - /* initialize our data structure for holding read tasks */ - bytes = MAX_META_PER_SEND * sizeof(read_task_t); - sm->read_task_set.read_tasks = (read_task_t*)malloc(bytes); - sm->read_task_set.num = 0; - sm->read_task_set.cap = MAX_META_PER_SEND; - - /* allocate a list to track pending data read operations */ - sm->pended_reads = arraylist_create(); - if (sm->pended_reads == NULL) { - LOGERR("failed to allocate service manager pended_reads!"); - svcmgr_fini(); - return (int)UNIFYFS_ERROR_NOMEM; - } - - /* allocate a list to track outstanding sends back to - * requesting delegators */ - sm->pended_sends = arraylist_create(); - if (sm->pended_sends == NULL) { - LOGERR("failed to allocate service manager pended_sends!"); - svcmgr_fini(); - return (int)UNIFYFS_ERROR_NOMEM; - } - - /* allocate a list to track pending send operations - * that need to be initiated back to requesting delegators */ - sm->send_buf_list = arraylist_create(); - if (sm->send_buf_list == NULL) { - LOGERR("failed to 
allocate service manager send_buf_list!"); - svcmgr_fini(); - return (int)UNIFYFS_ERROR_NOMEM; + return ENOMEM; } - /* allocate memory to hold meta data for read replies */ - bytes = glb_mpi_size * MAX_NUM_CLIENTS * sizeof(rank_ack_meta_t); - sm->rank_ack_task.num = 0; - sm->rank_ack_task.ack_metas = (rank_ack_meta_t*)malloc(bytes); - - /* initialize each ack_meta structure, keep one for each potential - * application client - * (num delegaotrs * max num clients per delegator) */ - int i; - for (i = 0; i < glb_mpi_size * MAX_NUM_CLIENTS; i++) { - /* get pointer to current structure */ - rank_ack_meta_t* ack_meta = &(sm->rank_ack_task.ack_metas[i]); - ack_meta->ack_list = NULL; - ack_meta->rank_id = -1; - ack_meta->thrd_id = -1; - ack_meta->src_sz = 0; - ack_meta->start_cursor = 0; - } - - int rc = pthread_mutex_init(&(sm->sync), NULL); - if (0 != rc) { - LOGERR("failed to initialize service manager mutex!"); - svcmgr_fini(); - return (int)UNIFYFS_ERROR_THRDINIT; - } + ABT_mutex_create(&(sm->sync)); sm->initialized = 1; - rc = pthread_create(&(sm->thrd), NULL, - sm_service_reads, (void*)sm); + int rc = pthread_create(&(sm->thrd), NULL, + service_manager_thread, (void*)sm); if (rc != 0) { LOGERR("failed to create service manager thread"); svcmgr_fini(); @@ -1505,38 +274,21 @@ int svcmgr_fini(void) { if (NULL != sm) { if (sm->thrd) { - // int exit_cmd = (int)SVC_CMD_EXIT; - // MPI_Send(&exit_cmd, sizeof(int), MPI_CHAR, glb_mpi_rank, - // (int)READ_REQUEST_TAG, MPI_COMM_WORLD); sm->time_to_exit = 1; pthread_join(sm->thrd, NULL); } + if (sm->initialized) { SM_LOCK(); } arraylist_free(sm->chunk_reads); - arraylist_free(sm->pended_reads); - arraylist_free(sm->pended_sends); - arraylist_free(sm->send_buf_list); - - if (NULL != sm->service_msgs.msg) { - free(sm->service_msgs.msg); - } - if (NULL != sm->read_task_set.read_tasks) { - free(sm->read_task_set.read_tasks); - } - - int i; - for (i = 0; i < sm->rank_ack_task.num; i++) { - arraylist_free(sm->rank_ack_task.ack_metas[i].ack_list); - } if (sm->initialized) { SM_UNLOCK(); - pthread_mutex_destroy(&(sm->sync)); } + /* free the service manager struct allocated during init */ free(sm); sm = NULL; } @@ -1546,185 +298,75 @@ int svcmgr_fini(void) /* iterate over list of chunk reads and send responses */ static int send_chunk_read_responses(void) { + /* assume we'll succeed */ int rc = (int)UNIFYFS_SUCCESS; - pthread_mutex_lock(&(sm->sync)); + + /* this will hold a list of chunk read requests if we find any */ + arraylist_t* chunk_reads = NULL; + + /* lock to access global service manager object */ + ABT_mutex_lock(sm->sync); + + /* if we have any chunk reads, take pointer to the list + * of chunk read requests and replace it with a newly allocated + * list on the service manager structure */ int num_chunk_reads = arraylist_size(sm->chunk_reads); if (num_chunk_reads) { + /* got some chunk read requets, take the list and replace + * it with an empty list */ LOGDBG("processing %d chunk read responses", num_chunk_reads); - for (int i = 0; i < num_chunk_reads; i++) { - /* get data structure */ - remote_chunk_reads_t* rcr = (remote_chunk_reads_t*) - arraylist_get(sm->chunk_reads, i); - rc = invoke_chunk_read_response_rpc(rcr); - if (rc != UNIFYFS_SUCCESS) { - LOGERR("failed to send chunk read responses"); - } - } - arraylist_reset(sm->chunk_reads); + chunk_reads = sm->chunk_reads; + sm->chunk_reads = arraylist_create(); } - pthread_mutex_unlock(&(sm->sync)); - return rc; -} -/* entry point for the service thread, service the read requests - * received from 
the requesting delegators, executes a loop constantly - * waiting on incoming message, for each message that comes in, - * appends read requests to sm->service_msgs list, if no message has - * arrived, and after some wait time (to catch bursty reads), - * then convert read requests into read tasks, execute read tasks - * to build read replies, and send read replies back to delegators */ -void* sm_service_reads(void* ctx) -{ - int rc; + /* release lock on service manager object */ + ABT_mutex_unlock(sm->sync); - LOGDBG("I am service manager thread!"); - assert(sm == (svcmgr_state_t*)ctx); - - /* received message format: - * cmd, req_num, a list of read requests */ - - /* buffer to hold received msg, - * since this is large, malloc it instead of declare it on stack - * (mitigate problems thread stack size limits) */ - char* req_msg_buf = (char*)malloc(REQ_BUF_LEN); - - /* counter for timed wait before creating read tasks, - * used to buffer multiple read requests before responding - * with the idea that reads come in bursts */ - int wait_time = 0; - - /* initialize value on how long to wait before processing - * incoming read requests */ - long bursty_interval = MAX_BURSTY_INTERVAL / 10; - - /* listen and server incoming requests until signaled to exit */ - MPI_Status status; - while (!sm->time_to_exit) { - /* post a receive for incoming request */ - MPI_Request request; - MPI_Irecv(req_msg_buf, REQ_BUF_LEN, MPI_BYTE, - MPI_ANY_SOURCE, (int)READ_REQUEST_TAG, - MPI_COMM_WORLD, &request); - - /* test whether we received anything */ - int irecv_flag = 0; - int mpi_rc = MPI_Test(&request, &irecv_flag, &status); - if (mpi_rc != MPI_SUCCESS) { - sm->sm_exit_rc = (int)UNIFYFS_ERROR_RECV; - return NULL; - } + /* iterate over each chunk read request */ + for (int i = 0; i < num_chunk_reads; i++) { + /* get next chunk read request */ + server_chunk_reads_t* scr = (server_chunk_reads_t*) + arraylist_get(chunk_reads, i); - send_chunk_read_responses(); - - /* as long as we keep receiving requests, we'll keep skipping - * the while loop below (and its sleep) and keep appending - * items to our read request queue */ - - /* - * keep receiving the read request - * until the end of a anticipated - * bursty behavior - * */ - while (!irecv_flag) { - - if (sm->time_to_exit) { - break; - } - - send_chunk_read_responses(); - - /* if we have not received anything, sleep */ - if (bursty_interval > MIN_SLEEP_INTERVAL) { - usleep(SLEEP_INTERVAL); /* wait an interval */ - wait_time += SLEEP_INTERVAL; - } - - /* a bursty behavior is considered end when - * wait time is larger than BURSTY_INTERVAL - * */ - if ((wait_time >= bursty_interval) || - (bursty_interval <= MIN_SLEEP_INTERVAL)) { - /* if time to wait has expired, and if we have some - * queued read requests, do some work */ - - pthread_mutex_lock(&(sm->sync)); - if (sm->service_msgs.num > 0) { - /* run through list of read requests and generate - * read tasks, merge requests for contiguous data - * into a single read task */ - rc = sm_cluster_reads(); - if (rc != 0) { - sm->sm_exit_rc = rc; - pthread_mutex_unlock(&(sm->sync)); - return NULL; - } + rc = invoke_chunk_read_response_rpc(scr); + } - /* execute read tasks, wait for them to complete, - * then pack read replies, send to delegators, and - * finally wait for sends to complete */ - rc = sm_read_send_pipe(); - if (rc != 0) { - sm->sm_exit_rc = rc; - pthread_mutex_unlock(&(sm->sync)); - return NULL; - } + /* free the list if we have one */ + if (NULL != chunk_reads) { + arraylist_free(chunk_reads); + } - /* 
have processed all read requests and read tasks, - * prep them for next message */ - sm->service_msgs.num = 0; - sm->read_task_set.num = 0; - - /* determine how long to wait next time based on - * how much data we just processed in this burst */ - if (sm->burst_data_sz >= LARGE_BURSTY_DATA) { - /* for large bursts above a threshold, - * wait for a fixed amount of time */ - bursty_interval = MAX_BURSTY_INTERVAL; - } else { - /* for smaller bursts, set delay proportionally - * to burst size we just processed */ - bursty_interval = - (SLEEP_SLICE_PER_UNIT * sm->burst_data_sz) / MIB; - } + return rc; +} - /* reset our burst size counter */ - sm->burst_data_sz = 0; - } - pthread_mutex_unlock(&(sm->sync)); +/* Entry point for service manager thread. The SM thread + * runs in a loop processing read request replies until + * the main server thread asks it to exit. The read requests + * themselves are handled by Margo RPC threads. + * + * @param arg: pointer to SM thread control structure + * @return NULL */ +void* service_manager_thread(void* arg) +{ + int rc; - /* reset our timer */ - wait_time = 0; - } + LOGDBG("I am the service manager thread!"); + assert(sm == (svcmgr_state_t*)arg); - /* test for receive again, will break while loop - * if we find something */ - mpi_rc = MPI_Test(&request, &irecv_flag, &status); - if (mpi_rc != MPI_SUCCESS) { - sm->sm_exit_rc = (int)UNIFYFS_ERROR_RECV; - return NULL; - } + /* handle chunk reads until signaled to exit */ + while (1) { + rc = send_chunk_read_responses(); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("failed to send chunk read responses"); } - if (irecv_flag) { - /* got a message, reset wait time */ - wait_time = 0; - - /* first value of receive buffer is integer holding command */ - int reqcmd = *((int*)req_msg_buf); - if (reqcmd == (int)SVC_CMD_RDREQ_MSG) { - /* got a request for data, append read requests in message - * to our sm->service_msgs list */ - sm_decode_msg(req_msg_buf); - } else if (reqcmd == (int)SVC_CMD_EXIT) { - /* time to exit */ - sm->time_to_exit = 1; - } + if (sm->time_to_exit) { + break; } - } - /* free receive buffer */ - free(req_msg_buf); - req_msg_buf = NULL; + /* wait an interval */ + usleep(MIN_SLEEP_INTERVAL); + } LOGDBG("service manager thread exiting"); @@ -1734,271 +376,179 @@ void* sm_service_reads(void* ctx) /* BEGIN MARGO SERVER-SERVER RPC INVOCATION FUNCTIONS */ -/* invokes the chunk_read_response rpc */ -int invoke_chunk_read_response_rpc(remote_chunk_reads_t* rcr) +/* invokes the chunk_read_response rpc, this sends a set of read + * reply headers and corresponding data back to a server that + * had requested we read data on its behalf, the headers and + * data are posted as a bulk transfer buffer */ +int invoke_chunk_read_response_rpc(server_chunk_reads_t* scr) { - int rc = (int)UNIFYFS_SUCCESS; - int dst_srvr_rank = rcr->rank; + /* assume we'll succeed */ + int rc = UNIFYFS_SUCCESS; + + /* rank of destination server */ + int dst_rank = scr->rank; + assert(dst_rank < (int)glb_num_servers); + + /* get address of destinaton server */ + hg_addr_t dst_addr = glb_servers[dst_rank].margo_svr_addr; + + /* pointer to struct containing rpc context info, + * shorter name for convience */ + ServerRpcContext_t* ctx = unifyfsd_rpc_context; + + /* get handle to read response rpc on destination server */ hg_handle_t handle; - chunk_read_response_in_t in; - chunk_read_response_out_t out; - hg_return_t hret; - hg_addr_t dst_srvr_addr; - hg_size_t bulk_sz = rcr->total_sz; - void* data_buf = (void*)rcr->resp; + hg_id_t resp_id = 
ctx->rpcs.chunk_read_response_id; + hg_return_t hret = margo_create(ctx->svr_mid, dst_addr, + resp_id, &handle); + if (hret != HG_SUCCESS) { + LOGERR("margo_create() failed"); + return UNIFYFS_ERROR_MARGO; + } - assert(dst_srvr_rank < (int)glb_num_servers); - dst_srvr_addr = glb_servers[dst_srvr_rank].margo_svr_addr; + /* get address and size of our response buffer */ + void* data_buf = (void*)scr->resp; + hg_size_t bulk_sz = scr->total_sz; - hret = margo_create(unifyfsd_rpc_context->svr_mid, dst_srvr_addr, - unifyfsd_rpc_context->rpcs.chunk_read_response_id, - &handle); - assert(hret == HG_SUCCESS); + /* register our response buffer for bulk remote read access */ + chunk_read_response_in_t in; + hret = margo_bulk_create(ctx->svr_mid, 1, &data_buf, &bulk_sz, + HG_BULK_READ_ONLY, &in.bulk_handle); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_create() failed"); + return UNIFYFS_ERROR_MARGO; + } /* fill in input struct */ - in.src_rank = (int32_t)glb_mpi_rank; - in.app_id = (int32_t)rcr->app_id; - in.client_id = (int32_t)rcr->client_id; - in.req_id = (int32_t)rcr->rdreq_id; - in.num_chks = (int32_t)rcr->num_chunks; + in.src_rank = (int32_t)glb_pmi_rank; + in.app_id = (int32_t)scr->app_id; + in.client_id = (int32_t)scr->client_id; + in.req_id = (int32_t)scr->rdreq_id; + in.num_chks = (int32_t)scr->num_chunks; in.bulk_size = bulk_sz; - /* register request buffer for bulk remote access */ - hret = margo_bulk_create(unifyfsd_rpc_context->svr_mid, 1, - &data_buf, &bulk_sz, - HG_BULK_READ_ONLY, &in.bulk_handle); - assert(hret == HG_SUCCESS); - + /* call the read response rpc */ LOGDBG("invoking the chunk-read-response rpc function"); hret = margo_forward(handle, &in); if (hret != HG_SUCCESS) { - rc = (int)UNIFYFS_FAILURE; + LOGERR("margo_forward() failed"); + rc = UNIFYFS_ERROR_MARGO; } else { - /* decode response */ + /* rpc executed, now decode response */ + chunk_read_response_out_t out; hret = margo_get_output(handle, &out); if (hret == HG_SUCCESS) { rc = (int)out.ret; LOGDBG("chunk-read-response rpc to %d - ret=%d", - dst_srvr_rank, rc); + dst_rank, rc); margo_free_output(handle, &out); } else { - rc = (int)UNIFYFS_FAILURE; + LOGERR("margo_get_output() failed"); + rc = UNIFYFS_ERROR_MARGO; } } + /* free resources allocated for executing margo rpc */ margo_bulk_free(in.bulk_handle); margo_destroy(handle); /* free response data buffer */ free(data_buf); - rcr->resp = NULL; + scr->resp = NULL; return rc; } /* BEGIN MARGO SERVER-SERVER RPC HANDLERS */ -/* handler for server-server hello - * - * print the message, and return my rank */ -static void server_hello_rpc(hg_handle_t handle) -{ - int rc, src_rank; - hg_return_t hret; - char* msg; - server_hello_in_t in; - server_hello_out_t out; - - /* get input params */ - rc = margo_get_input(handle, &in); - assert(rc == HG_SUCCESS); - src_rank = (int)in.src_rank; - msg = strdup(in.message_str); - if (NULL != msg) { - LOGDBG("got message '%s' from server %d", msg, src_rank); - free(msg); - } - - /* fill output structure to return to caller */ - out.ret = (int32_t)glb_mpi_rank; - - /* send output back to caller */ - hret = margo_respond(handle, &out); - assert(hret == HG_SUCCESS); - - /* free margo resources */ - margo_free_input(handle, &in); - margo_destroy(handle); -} -DEFINE_MARGO_RPC_HANDLER(server_hello_rpc) - -/* handler for server-server request - * - * decode payload based on tag, and call appropriate svcmgr routine */ -static void server_request_rpc(hg_handle_t handle) -{ - int rc, req_id, src_rank, tag; - int32_t ret; - hg_return_t hret; - 
hg_bulk_t bulk_handle; - size_t bulk_sz; - server_request_in_t in; - server_request_out_t out; - - /* get input params */ - rc = margo_get_input(handle, &in); - assert(rc == HG_SUCCESS); - src_rank = (int)in.src_rank; - req_id = (int)in.req_id; - tag = (int)in.req_tag; - bulk_sz = (size_t)in.bulk_size; - - LOGDBG("handling request from server %d: tag=%d req=%d sz=%zu", - src_rank, tag, req_id, bulk_sz); - - /* get margo info */ - const struct hg_info* hgi = margo_get_info(handle); - assert(NULL != hgi); - margo_instance_id mid = margo_hg_info_get_instance(hgi); - assert(mid != MARGO_INSTANCE_NULL); - - int reqcmd = 0; - void* reqbuf = NULL; - if (bulk_sz) { - /* allocate and register local target buffer for bulk access */ - reqbuf = malloc(bulk_sz); - if (NULL == reqbuf) { - ret = (int32_t)UNIFYFS_ERROR_NOMEM; - goto request_out; - } - hret = margo_bulk_create(mid, 1, &reqbuf, &in.bulk_size, - HG_BULK_WRITE_ONLY, &bulk_handle); - assert(hret == HG_SUCCESS); - - /* pull request data */ - hret = margo_bulk_transfer(mid, HG_BULK_PULL, hgi->addr, - in.bulk_handle, 0, - bulk_handle, 0, in.bulk_size); - assert(hret == HG_SUCCESS); - reqcmd = *(int*)reqbuf; - } - - switch (tag) { - case (int)READ_REQUEST_TAG: { - /* verify this is a request for data */ - if (reqcmd == (int)SVC_CMD_RDREQ_MSG) { - LOGDBG("request command: SVC_CMD_RDREQ_MSG"); - /* got a request for data, append read requests in message - * to our sm->service_msgs list */ - sm_decode_msg((char*)reqbuf); - ret = (int32_t)UNIFYFS_SUCCESS; - } else { - LOGERR("invalid request command %d from server %d", - reqcmd, src_rank); - ret = (int32_t)UNIFYFS_ERROR_INVAL; - } - break; - } - default: { - LOGERR("invalid request tag %d", tag); - ret = (int32_t)UNIFYFS_ERROR_INVAL; - break; - } - } - -request_out: - - /* fill output structure and return to caller */ - out.ret = ret; - hret = margo_respond(handle, &out); - assert(hret == HG_SUCCESS); - - /* free margo resources */ - margo_free_input(handle, &in); - if (NULL != reqbuf) { - margo_bulk_free(bulk_handle); - free(reqbuf); - } - margo_destroy(handle); -} -DEFINE_MARGO_RPC_HANDLER(server_request_rpc) - -/* handler for server-server request - * - * decode payload based on tag, and call appropriate svcmgr routine */ +/* handler for server-server chunk read request */ static void chunk_read_request_rpc(hg_handle_t handle) { - int rc, req_id, num_chks; - int src_rank, app_id, client_id; - int32_t ret; - hg_return_t hret; - hg_bulk_t bulk_handle; - size_t bulk_sz; - chunk_read_request_in_t in; - chunk_read_request_out_t out; + int32_t ret = UNIFYFS_SUCCESS; /* get input params */ - rc = margo_get_input(handle, &in); - assert(rc == HG_SUCCESS); - src_rank = (int)in.src_rank; - app_id = (int)in.app_id; - client_id = (int)in.client_id; - req_id = (int)in.req_id; - num_chks = (int)in.num_chks; - bulk_sz = (size_t)in.bulk_size; - - LOGDBG("handling chunk read request from server %d: " - "req=%d num_chunks=%d bulk_sz=%zu", - src_rank, req_id, num_chks, bulk_sz); - - /* get margo info */ - const struct hg_info* hgi = margo_get_info(handle); - assert(NULL != hgi); - margo_instance_id mid = margo_hg_info_get_instance(hgi); - assert(mid != MARGO_INSTANCE_NULL); - - int reqcmd = (int)SVC_CMD_INVALID; - void* reqbuf = NULL; - if (bulk_sz) { - /* allocate and register local target buffer for bulk access */ - reqbuf = malloc(bulk_sz); - if (NULL != reqbuf) { - hret = margo_bulk_create(mid, 1, &reqbuf, &in.bulk_size, - HG_BULK_WRITE_ONLY, &bulk_handle); - assert(hret == HG_SUCCESS); - - /* pull request data */ 
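/* Illustrative sketch, not part of this patch: the old handler being
 * deleted here and the new chunk_read_request_rpc() below follow the same
 * recipe for receiving a request payload: allocate a local buffer, expose
 * it for write access, and PULL the sender's bulk region into it.  The
 * helper below condenses that recipe using only the margo calls that
 * appear in this file; the function name is hypothetical and error paths
 * are trimmed for brevity. */
#include <margo.h>
#include <stdlib.h>

static void* sketch_pull_request(hg_handle_t handle,
                                 hg_bulk_t remote_bulk,
                                 hg_size_t size)
{
    /* recover the margo instance that delivered this handle */
    const struct hg_info* hgi = margo_get_info(handle);
    margo_instance_id mid = margo_hg_info_get_instance(hgi);

    /* local buffer that will receive the request data */
    void* buf = malloc(size);
    if (NULL == buf) {
        return NULL;
    }

    /* register it for bulk write access, then pull from the sender */
    hg_bulk_t local_bulk;
    hg_return_t hret = margo_bulk_create(mid, 1, &buf, &size,
                                         HG_BULK_WRITE_ONLY, &local_bulk);
    if (hret == HG_SUCCESS) {
        hret = margo_bulk_transfer(mid, HG_BULK_PULL, hgi->addr,
                                   remote_bulk, 0, local_bulk, 0, size);
        margo_bulk_free(local_bulk);
    }

    if (hret != HG_SUCCESS) {
        free(buf);
        return NULL;
    }
    return buf;   /* caller decodes the command word and payload */
}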
- hret = margo_bulk_transfer(mid, HG_BULK_PULL, hgi->addr, - in.bulk_handle, 0, - bulk_handle, 0, in.bulk_size); - assert(hret == HG_SUCCESS); - reqcmd = *(int*)reqbuf; - } - } - /* verify this is a request for data */ - if (reqcmd == (int)SVC_CMD_RDREQ_CHK) { - LOGDBG("request command: SVC_CMD_RDREQ_CHK"); - /* chunk read request */ - sm_issue_chunk_reads(src_rank, app_id, client_id, req_id, - num_chks, (char*)reqbuf); - ret = (int32_t)UNIFYFS_SUCCESS; + chunk_read_request_in_t in; + hg_return_t hret = margo_get_input(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; } else { - LOGERR("invalid chunk read request command %d from server %d", - reqcmd, src_rank); - ret = (int32_t)UNIFYFS_ERROR_INVAL; + /* extract params from input struct */ + int src_rank = (int)in.src_rank; + int app_id = (int)in.app_id; + int client_id = (int)in.client_id; + int req_id = (int)in.req_id; + int num_chks = (int)in.num_chks; + size_t bulk_sz = (size_t)in.bulk_size; + + LOGDBG("handling chunk read request from server %d: " + "req=%d num_chunks=%d bulk_sz=%zu", + src_rank, req_id, num_chks, bulk_sz); + + /* get margo info */ + const struct hg_info* hgi = margo_get_info(handle); + assert(NULL != hgi); + + margo_instance_id mid = margo_hg_info_get_instance(hgi); + assert(mid != MARGO_INSTANCE_NULL); + + hg_bulk_t bulk_handle; + int reqcmd = (int)SVC_CMD_INVALID; + void* reqbuf = NULL; + if (bulk_sz) { + /* allocate and register local target buffer for bulk access */ + reqbuf = malloc(bulk_sz); + if (NULL != reqbuf) { + hret = margo_bulk_create(mid, 1, &reqbuf, &in.bulk_size, + HG_BULK_WRITE_ONLY, &bulk_handle); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_create() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* pull request data */ + hret = margo_bulk_transfer(mid, HG_BULK_PULL, hgi->addr, + in.bulk_handle, 0, + bulk_handle, 0, in.bulk_size); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_transfer() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* first int in request buffer is the command */ + reqcmd = *(int*)reqbuf; + + /* verify this is a request for data */ + if (reqcmd == (int)SVC_CMD_RDREQ_CHK) { + /* chunk read request command */ + LOGDBG("request command: SVC_CMD_RDREQ_CHK"); + ret = sm_issue_chunk_reads(src_rank, + app_id, client_id, + req_id, num_chks, + (char*)reqbuf); + } else { + LOGERR("invalid command %d from server %d", + reqcmd, src_rank); + ret = EINVAL; + } + } + margo_bulk_free(bulk_handle); + } + free(reqbuf); + } else { + ret = ENOMEM; + } + } + margo_free_input(handle, &in); } - /* fill output structure and return to caller */ + /* return output to caller */ + chunk_read_request_out_t out; out.ret = ret; hret = margo_respond(handle, &out); - assert(hret == HG_SUCCESS); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } /* free margo resources */ - margo_free_input(handle, &in); - if (NULL != reqbuf) { - margo_bulk_free(bulk_handle); - free(reqbuf); - } margo_destroy(handle); } DEFINE_MARGO_RPC_HANDLER(chunk_read_request_rpc) diff --git a/server/src/unifyfs_service_manager.h b/server/src/unifyfs_service_manager.h index 2e055c37c..a2a9c9f2a 100644 --- a/server/src/unifyfs_service_manager.h +++ b/server/src/unifyfs_service_manager.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2017, UT-Battelle, LLC. 
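Note: the rewritten handler above treats the first integer of the pulled bulk buffer as the command word (it must be SVC_CMD_RDREQ_CHK) and hands the whole buffer to sm_issue_chunk_reads(). The exact wire layout beyond that leading int is not shown in this diff, so the following is only a minimal sketch of how a requesting server might pack such a buffer; chunk_read_req_t and pack_chunk_read_request() are hypothetical names introduced purely for illustration and are not part of the patch.

/* Minimal sketch (hypothetical, for illustration only): how a requesting
 * server might pack the bulk buffer that chunk_read_request_rpc() pulls.
 * Only the leading command int is confirmed by the handler above; the
 * per-chunk record below is an assumed layout, since the real format
 * consumed by sm_issue_chunk_reads() is defined elsewhere in the server
 * sources. */
#include <stdlib.h>
#include <string.h>

typedef struct {        /* hypothetical per-chunk descriptor */
    int    owner_rank;  /* server that holds the chunk's log data */
    size_t offset;      /* offset of the chunk within that log */
    size_t length;      /* number of bytes to read */
} chunk_read_req_t;

/* Pack a command word followed by num_chks descriptors into one contiguous
 * buffer; the returned size is what the sender would advertise as
 * in.bulk_size when registering the buffer for the bulk pull. */
void* pack_chunk_read_request(int cmd, const chunk_read_req_t* chunks,
                              int num_chks, size_t* out_sz)
{
    size_t sz = sizeof(int) + (size_t)num_chks * sizeof(chunk_read_req_t);
    char* buf = malloc(sz);
    if (NULL == buf) {
        return NULL;
    }
    memcpy(buf, &cmd, sizeof(int)); /* first int in buffer is the command */
    memcpy(buf + sizeof(int), chunks,
           (size_t)num_chks * sizeof(chunk_read_req_t));
    *out_sz = sz;
    return buf;
}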
+ * Copyright 2020, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -33,7 +33,7 @@ #include "unifyfs_global.h" /* service manager pthread routine */ -void* sm_service_reads(void* ctx); +void* service_manager_thread(void* ctx); /* initialize and launch service manager */ int svcmgr_init(void); @@ -41,9 +41,6 @@ int svcmgr_init(void); /* join service manager thread and cleanup its state */ int svcmgr_fini(void); -/* process service request message */ -int sm_decode_msg(char* msg_buf); - /* decode and issue chunk reads contained in message buffer */ int sm_issue_chunk_reads(int src_rank, int src_app_id, @@ -53,6 +50,6 @@ int sm_issue_chunk_reads(int src_rank, char* msg_buf); /* MARGO SERVER-SERVER RPC INVOCATION FUNCTIONS */ -int invoke_chunk_read_response_rpc(remote_chunk_reads_t* rcr); +int invoke_chunk_read_response_rpc(server_chunk_reads_t* scr); #endif // UNIFYFS_SERVICE_MANAGER_H diff --git a/server/src/unifyfs_sock.c b/server/src/unifyfs_sock.c deleted file mode 100644 index 27937fc93..000000000 --- a/server/src/unifyfs_sock.c +++ /dev/null @@ -1,289 +0,0 @@ -/* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * - * Copyright 2017, UT-Battelle, LLC. - * - * LLNL-CODE-741539 - * All rights reserved. - * - * This is the license for UnifyFS. - * For details, see https://github.com/LLNL/UnifyFS. - * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. - */ - -/* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * Copyright (c) 2017, Florida State University. Contributions from - * the Computer Architecture and Systems Research Laboratory (CASTL) - * at the Department of Computer Science. - * - * Written by: Teng Wang, Adam Moody, Weikuan Yu, Kento Sato, Kathryn Mohror - * LLNL-CODE-728877. All rights reserved. - * - * This file is part of burstfs. - * For details, see https://github.com/llnl/burstfs - * Please read https://github.com/llnl/burstfs/LICENSE for full license text. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "arraylist.h" -#include "unifyfs_const.h" -#include "unifyfs_global.h" -#include "unifyfs_keyval.h" -#include "unifyfs_log.h" -#include "unifyfs_sock.h" - -char sock_path[UNIFYFS_MAX_FILENAME]; -int server_sockfd = -1; -int num_fds; -struct pollfd poll_set[MAX_NUM_CLIENTS]; -struct sockaddr_un server_address; - -int detached_sock_idx = -1; -int cur_sock_idx = -1; - -char cmd_buf[MAX_NUM_CLIENTS][CMD_BUF_SIZE]; -char ack_buf[MAX_NUM_CLIENTS][CMD_BUF_SIZE]; -int ack_msg[3] = {0}; - -/* initialize the listening socket on this delegator - * @return success/error code */ -int sock_init_server(int srvr_id) -{ - int i, rc; - - for (i = 0; i < MAX_NUM_CLIENTS; i++) { - poll_set[i].fd = -1; - } - - num_fds = 0; - - snprintf(sock_path, sizeof(sock_path), "%s.%d.%d", - SOCKET_PATH, getuid(), srvr_id); - LOGDBG("domain socket path is %s", sock_path); - unlink(sock_path); // remove domain socket leftover from prior run - - server_sockfd = socket(AF_UNIX, SOCK_STREAM, 0); - - memset(&server_address, 0, sizeof(server_address)); - server_address.sun_family = AF_UNIX; - strcpy(server_address.sun_path, sock_path); - rc = bind(server_sockfd, (struct sockaddr*)&server_address, - (socklen_t)sizeof(server_address)); - if (rc != 0) { - close(server_sockfd); - return -1; - } - - rc = listen(server_sockfd, MAX_NUM_CLIENTS); - if (rc != 0) { - close(server_sockfd); - return -1; - } - - sock_add(server_sockfd); // puts server fd at index 0 of poll_set - LOGDBG("completed sock init server"); - - // publish domain socket path - unifyfs_keyval_publish_local(key_unifyfsd_socket, sock_path); - - return 0; -} - -void sock_sanitize_client(int client_idx) -{ - /* close socket for this client id - * and set fd back to -1 */ - if (poll_set[client_idx].fd != -1) { - close(poll_set[client_idx].fd); - poll_set[client_idx].fd = -1; - } -} - -int sock_sanitize(void) -{ - int i; - for (i = 0; i < num_fds; i++) { - sock_sanitize_client(i); - } - - if (server_sockfd != -1) { - server_sockfd = -1; - unlink(sock_path); - } - - return 0; -} - -int sock_add(int fd) -{ - if (num_fds == MAX_NUM_CLIENTS) { - LOGERR("exceeded MAX_NUM_CLIENTS"); - return -1; - } - - int flag = fcntl(fd, F_GETFL); - fcntl(fd, F_SETFL, flag | O_NONBLOCK); - - LOGDBG("sock_adding fd: %d", fd); - poll_set[num_fds].fd = fd; - poll_set[num_fds].events = POLLIN | POLLHUP; - poll_set[num_fds].revents = 0; - num_fds++; - return 0; -} - -void sock_reset(void) -{ - int i; - cur_sock_idx = -1; - detached_sock_idx = -1; - for (i = 0; i < num_fds; i++) { - poll_set[i].events = POLLIN | POLLHUP; - poll_set[i].revents = 0; - } -} - -int sock_remove(int client_idx) -{ - /* in this case, we simply disable the disconnected - * file descriptor. 
*/ - poll_set[client_idx].fd = -1; - return 0; -} - -/* - * wait for the client-side command - * */ - -int sock_wait_cmd(int poll_timeout) -{ - int rc, i, client_fd; - - sock_reset(); - rc = poll(poll_set, num_fds, poll_timeout); - if (rc < 0) { - return (int)UNIFYFS_ERROR_POLL; - } else if (rc == 0) { // timeout - return (int)UNIFYFS_SUCCESS; - } else { - LOGDBG("poll detected socket activity"); - for (i = 0; i < num_fds; i++) { - if (poll_set[i].fd == -1) { - continue; - } - if (i == 0) { // listening socket - if (poll_set[i].revents & POLLIN) { - int client_len = sizeof(struct sockaddr_un); - struct sockaddr_un client_address; - client_fd = accept(server_sockfd, - (struct sockaddr*)&client_address, - (socklen_t*)&client_len); - LOGDBG("accepted client on socket %d", client_fd); - rc = sock_add(client_fd); - if (rc < 0) { - return (int)UNIFYFS_ERROR_SOCKET_FD_EXCEED; - } - } else if (poll_set[i].revents & POLLERR) { - // unknown error on listening socket - return (int)UNIFYFS_ERROR_SOCK_LISTEN; - } - } else { // (i != 0) client sockets - rc = 0; - if (poll_set[i].revents & POLLIN) { - ssize_t bytes_read = read(poll_set[i].fd, - cmd_buf[i], CMD_BUF_SIZE); - if (bytes_read == 0) { - rc = (int)UNIFYFS_ERROR_SOCK_DISCONNECT; - } else { // read a client command - cur_sock_idx = i; - return UNIFYFS_SUCCESS; - } - } else if (poll_set[i].revents & POLLHUP) { - rc = (int)UNIFYFS_ERROR_SOCK_DISCONNECT; - } else if (poll_set[i].revents & POLLERR) { - // unknown error on client socket - rc = (int)UNIFYFS_ERROR_SOCK_OTHER; - } - if (rc) { - if (rc == (int)UNIFYFS_ERROR_SOCK_DISCONNECT) { - sock_remove(i); - detached_sock_idx = i; - } - return rc; - } - } - } - } - - return UNIFYFS_SUCCESS; - -} - -#if 0 // DEPRECATED DUE TO MARGO -/* - * send command to the client to let the client digest the - * data in the shared receive buffer - * @param: sock_id: socket index in poll_set - * @param: cmd: command type - * - * */ -int sock_notify_client(int client_idx, int cmd) -{ - LOGDBG("sock notifying fd: %d", client_sockfd); - - memset(ack_buf[client_idx], 0, sizeof(ack_buf[client_idx])); - memcpy(ack_buf[client_idx], &cmd, sizeof(int)); - - ssize_t rc = write(client_sockfd, ack_buf[client_idx], - sizeof(ack_buf[client_idx])); - if (rc < 0) { - return (int)UNIFYFS_ERROR_SOCK_OTHER; - } - return UNIFYFS_SUCCESS; -} - -int sock_ack_client(int client_idx, int ret_sz) -{ - ssize_t rc = write(poll_set[client_idx].fd, ack_buf[client_idx], ret_sz); - if (rc < 0) { - return (int)UNIFYFS_ERROR_SOCK_OTHER; - } - return UNIFYFS_SUCCESS; -} - -int sock_handle_error(int sock_error_no) -{ - return UNIFYFS_SUCCESS; -} - -char* sock_get_cmd_buf(int client_idx) -{ - return (char*) cmd_buf[client_idx]; -} - -char* sock_get_ack_buf(int client_idx) -{ - return (char*) ack_buf[client_idx]; -} - -int sock_get_id(void) -{ - return cur_sock_idx; -} - -int sock_get_error_id(void) -{ - return detached_sock_idx; -} -#endif // DEPRECATED DUE TO MARGO diff --git a/server/src/unifyfs_sock.h b/server/src/unifyfs_sock.h deleted file mode 100644 index 794f02e9f..000000000 --- a/server/src/unifyfs_sock.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * - * Copyright 2017, UT-Battelle, LLC. - * - * LLNL-CODE-741539 - * All rights reserved. - * - * This is the license for UnifyFS. - * For details, see https://github.com/LLNL/UnifyFS. - * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
- */ - -/* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * Copyright (c) 2017, Florida State University. Contributions from - * the Computer Architecture and Systems Research Laboratory (CASTL) - * at the Department of Computer Science. - * - * Written by: Teng Wang, Adam Moody, Weikuan Yu, Kento Sato, Kathryn Mohror - * LLNL-CODE-728877. All rights reserved. - * - * This file is part of burstfs. - * For details, see https://github.com/llnl/burstfs - * Please read https://github.com/llnl/burstfs/LICENSE for full license text. - */ - -#ifndef UNIFYFS_SOCK_H -#define UNIFYFS_SOCK_H - -#include -#include "unifyfs_const.h" - -extern int server_sockfd; -extern int client_sockfd; -extern struct pollfd poll_set[MAX_NUM_CLIENTS]; - -int sock_init_server(int srvr_id); -void sock_sanitize_client(int client_idx); -int sock_sanitize(void); -int sock_add(int fd); -int sock_remove(int client_idx); -void sock_reset(void); -int sock_wait_cmd(int poll_timeout); - -#if 0 // DEPRECATED DUE TO MARGO -int sock_handle_error(int sock_error_no); -int sock_get_id(void); -int sock_get_error_id(void); -char* sock_get_cmd_buf(int client_idx); -char* sock_get_ack_buf(int client_idx); -int sock_ack_client(int client_idx, int ret_sz); -int sock_notify_client(int client_idx, int cmd); -#endif // DEPRECATED DUE TO MARGO - -#endif diff --git a/server/src/unifyfs_tree.c b/server/src/unifyfs_tree.c new file mode 100644 index 000000000..5fa3b3c4f --- /dev/null +++ b/server/src/unifyfs_tree.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
+ */ + +#include +#include +#include + +#include "unifyfs_tree.h" + +/** + * @brief given the process's rank and the number of ranks, this computes a + * k-ary tree rooted at rank 0, the structure records the number of children of + * the local rank and the list of their ranks + * + * @param rank rank of calling process + * @param ranks number of ranks in tree + * @param root rank of root of tree + * @param k degree of k-ary tree + * @param t output tree structure + */ +int unifyfs_tree_init( + int rank, /* rank of calling process */ + int ranks, /* number of ranks in tree */ + int root, /* rank of root of tree */ + int k, /* degree of k-ary tree */ + unifyfs_tree_t* t) /* output tree structure */ +{ + int i; + + /* compute distance from our rank to root, + * rotate ranks to put root as rank 0 */ + rank -= root; + if (rank < 0) { + rank += ranks; + } + + /* compute parent and child ranks with root as rank 0 */ + + /* initialize fields */ + t->rank = rank; + t->ranks = ranks; + t->parent_rank = -1; + t->child_count = 0; + t->child_ranks = NULL; + + /* compute the maximum number of children this task may have */ + int max_children = k; + + /* allocate memory to hold list of children ranks */ + if (max_children > 0) { + size_t bytes = (size_t)max_children * sizeof(int); + t->child_ranks = (int*) malloc(bytes); + + if (t->child_ranks == NULL) { + //LOGERR("Failed to allocate memory for child rank array"); + return ENOMEM; + } + } + + /* initialize all ranks to NULL */ + for (i = 0; i < max_children; i++) { + t->child_ranks[i] = -1; + } + + /* compute rank of our parent if we have one */ + if (rank > 0) { + t->parent_rank = (rank - 1) / k; + } + + /* identify ranks of what would be leftmost + * and rightmost children */ + int left = rank * k + 1; + int right = rank * k + k; + + /* if we have at least one child, + * compute number of children and list of child ranks */ + if (left < ranks) { + /* adjust right child in case we don't have a full set of k */ + if (right >= ranks) { + right = ranks - 1; + } + + /* compute number of children */ + t->child_count = right - left + 1; + + /* fill in rank for each child */ + for (i = 0; i < t->child_count; i++) { + t->child_ranks[i] = left + i; + } + } + + /* rotate tree neighbor ranks to use global ranks */ + + /* rotate our rank in tree */ + t->rank += root; + if (t->rank >= ranks) { + t->rank -= ranks; + } + + /* rotate rank of our parent in tree if we have one */ + if (t->parent_rank != -1) { + t->parent_rank += root; + if (t->parent_rank >= ranks) { + t->parent_rank -= ranks; + } + } + + /* rotate rank of each child in tree */ + for (i = 0; i < t->child_count; i++) { + t->child_ranks[i] += root; + if (t->child_ranks[i] >= ranks) { + t->child_ranks[i] -= ranks; + } + } + + return 0; +} + +void unifyfs_tree_free(unifyfs_tree_t* t) +{ + /* free child rank list */ + free(t->child_ranks); + t->child_ranks = NULL; + + return; +} + diff --git a/server/src/unifyfs_tree.h b/server/src/unifyfs_tree.h new file mode 100644 index 000000000..27c474735 --- /dev/null +++ b/server/src/unifyfs_tree.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
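The doc comment above describes the layout that unifyfs_tree_init() computes: after rotating the root to position 0, rank r has parent (r - 1) / k and children r*k + 1 through r*k + k, clamped to the number of ranks, and all ranks are rotated back to global numbering before returning. A small standalone driver, assuming it is compiled together with the unifyfs_tree.c and unifyfs_tree.h files added here (it is not part of the patch or its test suite), makes the layout concrete:

/* Standalone illustration: build the k-ary tree for every rank of an
 * 8-rank job with root 0 and degree k = 2, then print each rank's parent
 * and children. */
#include <stdio.h>
#include "unifyfs_tree.h"

int main(void)
{
    int ranks = 8;
    int root = 0;
    int k = 2;

    for (int rank = 0; rank < ranks; rank++) {
        unifyfs_tree_t t;
        int rc = unifyfs_tree_init(rank, ranks, root, k, &t);
        if (rc != 0) {
            fprintf(stderr, "tree init failed for rank %d (rc=%d)\n",
                    rank, rc);
            return 1;
        }

        printf("rank %d: parent=%d children=[", t.rank, t.parent_rank);
        for (int i = 0; i < t.child_count; i++) {
            printf("%s%d", (i ? " " : ""), t.child_ranks[i]);
        }
        printf("]\n");

        unifyfs_tree_free(&t);
    }
    return 0;
}

With these parameters, rank 0 reports parent -1 and children 1 and 2, rank 3 reports parent 1 and a single child 7, and ranks 4 through 7 are leaves.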
+ */ + +#ifndef UNIFYFS_TREE_H +#define UNIFYFS_TREE_H + +#include +#include "unifyfs_meta.h" + +/* define tree structure */ +typedef struct { + int rank; /* global rank of calling process */ + int ranks; /* number of ranks in tree */ + int parent_rank; /* parent rank, -1 if root */ + int child_count; /* number of children */ + int* child_ranks; /* list of child ranks */ +} unifyfs_tree_t; + +/* given the process's rank and the number of ranks, this computes a k-ary + * tree rooted at rank 0, the structure records the number of children + * of the local rank and the list of their ranks */ +int unifyfs_tree_init( + int rank, /* rank of calling process */ + int ranks, /* number of ranks in tree */ + int root, /* rank of root process */ + int k, /* degree of k-ary tree */ + unifyfs_tree_t* t /* output tree structure */ +); + +/* free resources allocated in unifyfs_tree_init */ +void unifyfs_tree_free(unifyfs_tree_t* t); + +#endif /* UNIFYFS_TREE_H */ diff --git a/t/0001-setup.t b/t/0001-setup.t index e9fd500a0..4cf3042a3 100755 --- a/t/0001-setup.t +++ b/t/0001-setup.t @@ -14,8 +14,13 @@ echo 1..1 # common metadata directory across multiple tests. Save the value to a # script in known location that later test scripts can source. # -export UNIFYFS_MOUNT_POINT=$(mktemp -d) -export UNIFYFS_META_DB_PATH=$(mktemp -d) +export UNIFYFS_TEST_TMPDIR=$(mktemp -d) +mkdir -p $UNIFYFS_TEST_TMPDIR/{meta,mount,share,spill,state} +export UNIFYFS_TEST_META=$UNIFYFS_TEST_TMPDIR/meta +export UNIFYFS_TEST_MOUNT=$UNIFYFS_TEST_TMPDIR/mount +export UNIFYFS_TEST_SHARE=$UNIFYFS_TEST_TMPDIR/share +export UNIFYFS_TEST_SPILL=$UNIFYFS_TEST_TMPDIR/spill +export UNIFYFS_TEST_STATE=$UNIFYFS_TEST_TMPDIR/state # # Source test environment first to pick up UNIFYFS_TEST_RUN_SCRIPT @@ -23,25 +28,30 @@ export UNIFYFS_META_DB_PATH=$(mktemp -d) . $(dirname $0)/sharness.d/00-test-env.sh cat >"$UNIFYFS_TEST_RUN_SCRIPT" <<-EOF -export UNIFYFS_MOUNT_POINT=$UNIFYFS_MOUNT_POINT -export UNIFYFS_META_DB_PATH=$UNIFYFS_META_DB_PATH +export UNIFYFS_TEST_TMPDIR=$UNIFYFS_TEST_TMPDIR +export UNIFYFS_TEST_META=$UNIFYFS_TEST_META +export UNIFYFS_TEST_MOUNT=$UNIFYFS_TEST_MOUNT +export UNIFYFS_TEST_SHARE=$UNIFYFS_TEST_SHARE +export UNIFYFS_TEST_SPILL=$UNIFYFS_TEST_SPILL +export UNIFYFS_TEST_STATE=$UNIFYFS_TEST_STATE +export UNIFYFS_LOG_VERBOSITY=5 EOF . $(dirname $0)/sharness.d/01-unifyfs-settings.sh . $(dirname $0)/sharness.d/02-functions.sh # -# Start the UnifyFS daemon after killing and cleanup up after any previously +# Start the UnifyFS daemon after killing any previously # running instance. # unifyfsd_stop_daemon -unifyfsd_cleanup unifyfsd_start_daemon # # Make sure the unifyfsd process starts. # if ! process_is_running unifyfsd 5 ; then + cat $UNIFYFS_LOG_DIR/${UNIFYFS_LOG_FILE}* >&3 echo not ok 1 - unifyfsd started exit 1 fi @@ -51,18 +61,10 @@ fi # it dies during initialization. # if process_is_not_running unifyfsd 5; then + cat $UNIFYFS_LOG_DIR/${UNIFYFS_LOG_FILE}* >&3 echo not ok 1 - unifyfsd running exit 1 fi -# -# Make sure unifyfsd successfully generated client runstate file -# -uid=$(id -u) -if ! test -f $UNIFYFS_META_DB_PATH/unifyfs-runstate.conf.$uid ; then - echo not ok 1 - unifyfsd runstate - exit 1 -fi - echo ok 1 - unifyfsd running exit 0 diff --git a/t/0100-sysio-gotcha.t b/t/0100-sysio-gotcha.t index 9a2795f5c..762121fab 100755 --- a/t/0100-sysio-gotcha.t +++ b/t/0100-sysio-gotcha.t @@ -5,4 +5,4 @@ # . $(dirname $0)/sharness.d/00-test-env.sh . 
$(dirname $0)/sharness.d/01-unifyfs-settings.sh -$UNIFYFS_BUILD_DIR/t/sys/sysio-gotcha.t +$JOB_RUN_COMMAND $UNIFYFS_BUILD_DIR/t/sys/sysio-gotcha.t diff --git a/t/0200-stdio-gotcha.t b/t/0200-stdio-gotcha.t index df7cad09f..b1fb030f3 100755 --- a/t/0200-stdio-gotcha.t +++ b/t/0200-stdio-gotcha.t @@ -5,4 +5,4 @@ # . $(dirname $0)/sharness.d/00-test-env.sh . $(dirname $0)/sharness.d/01-unifyfs-settings.sh -$UNIFYFS_BUILD_DIR/t/std/stdio-gotcha.t +$JOB_RUN_COMMAND $UNIFYFS_BUILD_DIR/t/std/stdio-gotcha.t diff --git a/t/0500-sysio-static.t b/t/0500-sysio-static.t index 3772ab8ac..b5a422a26 100755 --- a/t/0500-sysio-static.t +++ b/t/0500-sysio-static.t @@ -5,4 +5,4 @@ # . $(dirname $0)/sharness.d/00-test-env.sh . $(dirname $0)/sharness.d/01-unifyfs-settings.sh -$UNIFYFS_BUILD_DIR/t/sys/sysio-static.t +$JOB_RUN_COMMAND $UNIFYFS_BUILD_DIR/t/sys/sysio-static.t diff --git a/t/0600-stdio-static.t b/t/0600-stdio-static.t index e1322b635..83ff853c1 100755 --- a/t/0600-stdio-static.t +++ b/t/0600-stdio-static.t @@ -5,4 +5,4 @@ # . $(dirname $0)/sharness.d/00-test-env.sh . $(dirname $0)/sharness.d/01-unifyfs-settings.sh -$UNIFYFS_BUILD_DIR/t/std/stdio-static.t +$JOB_RUN_COMMAND $UNIFYFS_BUILD_DIR/t/std/stdio-static.t diff --git a/t/0700-unifyfs-stage-full.t b/t/0700-unifyfs-stage-full.t new file mode 100755 index 000000000..de8540ba6 --- /dev/null +++ b/t/0700-unifyfs-stage-full.t @@ -0,0 +1,64 @@ +#!/bin/bash +# +# Test unifyfs-stage executable for basic functionality +# + +test_description="Test basic functionality of unifyfs-stage executable" + +. $(dirname $0)/sharness.sh + +test_expect_success "unifyfs-stage exists" ' + test_path_is_file ${SHARNESS_BUILD_DIRECTORY}/util/unifyfs-stage/src/unifyfs-stage +' +test_expect_success "testing temp dir exists" ' + test_path_is_dir ${UNIFYFS_TEST_TMPDIR} +' + +mkdir -p ${UNIFYFS_TEST_TMPDIR}/config_0700 +mkdir -p ${UNIFYFS_TEST_TMPDIR}/stage_source +mkdir -p ${UNIFYFS_TEST_TMPDIR}/stage_destination_0700 + +test_expect_success "stage testing dirs exist" ' + test_path_is_dir ${UNIFYFS_TEST_TMPDIR}/config_0700 + test_path_is_dir ${UNIFYFS_TEST_TMPDIR}/stage_source + test_path_is_dir ${UNIFYFS_TEST_TMPDIR}/stage_destination_0700 +' + +dd if=/dev/urandom bs=4M count=1 of=${UNIFYFS_TEST_TMPDIR}/stage_source/source_0700.file + +test_expect_success "source.file exists" ' + test_path_is_file ${UNIFYFS_TEST_TMPDIR}/stage_source/source_0700.file +' + +rm -f ${UNIFYFS_TEST_TMPDIR}/config_0700/* +rm -f ${UNIFYFS_TEST_TMPDIR}/stage_destination_0700/* + +test_expect_success "config_0700 directory is empty" ' + test_dir_is_empty ${UNIFYFS_TEST_TMPDIR}/config_0700 +' + +echo "\"${UNIFYFS_TEST_TMPDIR}/stage_source/source_0700.file\" \"${UNIFYFS_TEST_MOUNT}/intermediate.file\"" > ${UNIFYFS_TEST_TMPDIR}/config_0700/test_IN.manifest +echo "\"${UNIFYFS_TEST_MOUNT}/intermediate.file\" \"${UNIFYFS_TEST_TMPDIR}/stage_destination_0700/destination_0700.file\"" > ${UNIFYFS_TEST_TMPDIR}/config_0700/test_OUT.manifest + +test_expect_success "config_0700 directory now has manifest files" ' + test_path_is_file ${UNIFYFS_TEST_TMPDIR}/config_0700/test_IN.manifest + test_path_is_file ${UNIFYFS_TEST_TMPDIR}/config_0700/test_OUT.manifest +' + +test_expect_success "target directory is empty" ' + test_dir_is_empty ${UNIFYFS_TEST_TMPDIR}/stage_destination_0700 +' + +$JOB_RUN_COMMAND ${SHARNESS_BUILD_DIRECTORY}/util/unifyfs-stage/src/unifyfs-stage -m ${UNIFYFS_TEST_MOUNT} ${UNIFYFS_TEST_TMPDIR}/config_0700/test_IN.manifest > ${UNIFYFS_TEST_TMPDIR}/config_0700/stage_IN_output.OUT 2>&1 + 
+$JOB_RUN_COMMAND ${SHARNESS_BUILD_DIRECTORY}/util/unifyfs-stage/src/unifyfs-stage -m ${UNIFYFS_TEST_MOUNT} ${UNIFYFS_TEST_TMPDIR}/config_0700/test_OUT.manifest > ${UNIFYFS_TEST_TMPDIR}/config_0700/stage_OUT_output.OUT 2>&1 + +test_expect_success "input file has been staged to output" ' + test_path_is_file ${UNIFYFS_TEST_TMPDIR}/stage_destination_0700/destination_0700.file +' + +test_expect_success "final output is identical to initial input" ' + test_might_fail test_cmp ${UNIFYFS_TEST_TMPDIR}/stage_source/source_0700.file ${UNIFYFS_TEST_TMPDIR}/stage_destination_0700/destination_0700.file +' + +test_done diff --git a/t/9005-unifyfs-unmount.t b/t/9005-unifyfs-unmount.t index 31e5bdd7f..25d673752 100755 --- a/t/9005-unifyfs-unmount.t +++ b/t/9005-unifyfs-unmount.t @@ -5,4 +5,4 @@ # . $(dirname $0)/sharness.d/00-test-env.sh . $(dirname $0)/sharness.d/01-unifyfs-settings.sh -$UNIFYFS_BUILD_DIR/t/unifyfs_unmount.t +$JOB_RUN_COMMAND $UNIFYFS_BUILD_DIR/t/unifyfs_unmount.t diff --git a/t/9020-mountpoint-empty.t b/t/9020-mountpoint-empty.t index 54c20b59e..82141dd9d 100755 --- a/t/9020-mountpoint-empty.t +++ b/t/9020-mountpoint-empty.t @@ -11,8 +11,8 @@ test_description="Verify UnifyFS intercepted mount point is empty" . $(dirname $0)/sharness.sh -test_expect_success "Intercepted mount point $UNIFYFS_MOUNT_POINT is empty" ' - test_dir_is_empty $UNIFYFS_MOUNT_POINT +test_expect_success "Intercepted mount point $UNIFYFS_MOUNTPOINT is empty" ' + test_dir_is_empty $UNIFYFS_MOUNTPOINT ' test_done diff --git a/t/9200-seg-tree-test.t b/t/9200-seg-tree-test.t new file mode 100755 index 000000000..f062b833c --- /dev/null +++ b/t/9200-seg-tree-test.t @@ -0,0 +1,8 @@ +#!/bin/bash +# +# Source sharness environment scripts to pick up test environment +# and UnifyFS runtime settings. +# +. $(dirname $0)/sharness.d/00-test-env.sh +. $(dirname $0)/sharness.d/01-unifyfs-settings.sh +$UNIFYFS_BUILD_DIR/t/common/seg_tree_test.t diff --git a/t/9100-metadata-api.t b/t/9201-slotmap-test.t similarity index 62% rename from t/9100-metadata-api.t rename to t/9201-slotmap-test.t index 7ae09a671..0ff8efeec 100755 --- a/t/9100-metadata-api.t +++ b/t/9201-slotmap-test.t @@ -1,10 +1,8 @@ #!/bin/bash - -test_description="Test Metadata API" - # # Source sharness environment scripts to pick up test environment # and UnifyFS runtime settings. # . $(dirname $0)/sharness.d/00-test-env.sh -$UNIFYFS_BUILD_DIR/t/server/metadata.t +. $(dirname $0)/sharness.d/01-unifyfs-settings.sh +$UNIFYFS_BUILD_DIR/t/common/slotmap_test.t diff --git a/t/9300-unifyfs-stage-isolated.t b/t/9300-unifyfs-stage-isolated.t new file mode 100755 index 000000000..007853999 --- /dev/null +++ b/t/9300-unifyfs-stage-isolated.t @@ -0,0 +1,62 @@ +#!/bin/bash +# +# Test unifyfs-stage executable for basic functionality +# + +test_description="Test basic functionality of unifyfs-stage executable" + +. 
$(dirname $0)/sharness.sh + +test_expect_success "unifyfs-stage exists" ' + test_path_is_file ${SHARNESS_BUILD_DIRECTORY}/util/unifyfs-stage/src/unifyfs-stage +' +test_expect_success "testing temp dir exists" ' + test_path_is_dir ${UNIFYFS_TEST_TMPDIR} +' + +mkdir -p ${UNIFYFS_TEST_TMPDIR}/config_9300 +mkdir -p ${UNIFYFS_TEST_TMPDIR}/stage_source +mkdir -p ${UNIFYFS_TEST_TMPDIR}/stage_destination_9300 + +test_expect_success "stage testing dirs exist" ' + test_path_is_dir ${UNIFYFS_TEST_TMPDIR}/config_9300 + test_path_is_dir ${UNIFYFS_TEST_TMPDIR}/stage_source + test_path_is_dir ${UNIFYFS_TEST_TMPDIR}/stage_destination_9300 +' + +# NOTE: we're using the unifyfs-stage binary as its own transfer data target +# because we know it's there and it's filled with non-zero data. +cp ${SHARNESS_BUILD_DIRECTORY}/util/unifyfs-stage/src/unifyfs-stage ${UNIFYFS_TEST_TMPDIR}/stage_source/source_9300.file + +test_expect_success "source.file exists" ' + test_path_is_file ${UNIFYFS_TEST_TMPDIR}/stage_source/source_9300.file +' + +rm -f ${UNIFYFS_TEST_TMPDIR}/config_9300/* +rm -f ${UNIFYFS_TEST_TMPDIR}/stage_destination_9300/* + +test_expect_success "config_9300 directory is empty" ' + test_dir_is_empty ${UNIFYFS_TEST_TMPDIR}/config_9300 +' + +echo "\"${UNIFYFS_TEST_TMPDIR}/stage_source/source_9300.file\" \"${UNIFYFS_TEST_TMPDIR}/stage_destination_9300/destination_9300.file\"" > ${UNIFYFS_TEST_TMPDIR}/config_9300/test_INOUT.manifest + +test_expect_success "config_9300 directory now has manifest files" ' + test_path_is_file ${UNIFYFS_TEST_TMPDIR}/config_9300/test_INOUT.manifest +' + +test_expect_success "target directory is empty" ' + test_dir_is_empty ${UNIFYFS_TEST_TMPDIR}/stage_destination_9300 +' + +$JOB_RUN_COMMAND ${SHARNESS_BUILD_DIRECTORY}/util/unifyfs-stage/src/unifyfs-stage -N ${UNIFYFS_TEST_TMPDIR}/config_9300/test_INOUT.manifest > ${UNIFYFS_TEST_TMPDIR}/config_9300/stage_INOUT_output.OUT 2>&1 + +test_expect_success "input file has been staged to output" ' + test_path_is_file ${UNIFYFS_TEST_TMPDIR}/stage_destination_9300/destination_9300.file +' + +test_expect_success "final output is identical to initial input" ' + test_cmp ${UNIFYFS_TEST_TMPDIR}/stage_source/source_9300.file ${UNIFYFS_TEST_TMPDIR}/stage_destination_9300/destination_9300.file +' + +test_done diff --git a/t/9999-cleanup.t b/t/9999-cleanup.t new file mode 100755 index 000000000..a72e1dd24 --- /dev/null +++ b/t/9999-cleanup.t @@ -0,0 +1,11 @@ +#!/bin/bash + +test_description="Cleanup test environment" + +. 
$(dirname $0)/sharness.sh + +test_expect_success "Cleanup" ' + unifyfsd_cleanup +' + +test_done diff --git a/t/Makefile.am b/t/Makefile.am index 8394d9e74..abfe4491e 100644 --- a/t/Makefile.am +++ b/t/Makefile.am @@ -9,10 +9,14 @@ TESTS = \ 0200-stdio-gotcha.t \ 0500-sysio-static.t \ 0600-stdio-static.t \ + 0700-unifyfs-stage-full.t \ 9005-unifyfs-unmount.t \ 9010-stop-unifyfsd.t \ 9020-mountpoint-empty.t \ - 9100-metadata-api.t + 9200-seg-tree-test.t \ + 9201-slotmap-test.t \ + 9300-unifyfs-stage-isolated.t \ + 9999-cleanup.t check_SCRIPTS = \ 0001-setup.t \ @@ -20,10 +24,14 @@ check_SCRIPTS = \ 0200-stdio-gotcha.t \ 0500-sysio-static.t \ 0600-stdio-static.t \ + 0700-unifyfs-stage-full.t \ 9005-unifyfs-unmount.t \ 9010-stop-unifyfsd.t \ 9020-mountpoint-empty.t \ - 9100-metadata-api.t + 9200-seg-tree-test.t \ + 9201-slotmap-test.t \ + 9300-unifyfs-stage-isolated.t \ + 9999-cleanup.t EXTRA_DIST = \ $(check_SCRIPTS) \ @@ -31,24 +39,34 @@ EXTRA_DIST = \ sharness.sh \ tap-driver.sh -AM_CFLAGS = -Wall +AM_CFLAGS = -Wall -Werror clean-local: rm -fr trash-directory.* test-results *.log test_run_env.sh libexec_PROGRAMS = \ - sys/sysio-gotcha.t \ + common/seg_tree_test.t \ + common/slotmap_test.t \ std/stdio-gotcha.t \ - sys/sysio-static.t \ std/stdio-static.t \ - server/metadata.t \ + sys/sysio-gotcha.t \ + sys/sysio-static.t \ unifyfs_unmount.t + +test_common_ldadd = \ + $(top_builddir)/t/lib/libtap.la \ + $(top_builddir)/t/lib/libtestutil.la \ + $(top_builddir)/common/src/libunifyfs_common.la + +test_common_ldflags = \ + -static $(AM_LDFLAGS) + test_ldadd = \ $(top_builddir)/t/lib/libtap.la \ $(top_builddir)/t/lib/libtestutil.la \ $(top_builddir)/client/src/libunifyfs_gotcha.la \ - $(MPI_CLDFLAGS) $(FLATCC_LDFLAGS) $(FLATCC_LIBS) + $(MPI_CLDFLAGS) test_static_ldadd = \ $(top_builddir)/t/lib/libtap.la \ @@ -56,32 +74,15 @@ test_static_ldadd = \ $(top_builddir)/client/src/libunifyfs.la test_static_ldflags = \ - -static \ - $(CP_WRAPPERS) $(AM_LDFLAGS) $(MPI_CLDFLAGS) $(FLATCC_LDFLAGS) \ - $(FLATCC_LIBS) + -static $(AM_LDFLAGS) \ + $(CP_WRAPPERS) \ + $(MPI_CLDFLAGS) -test_metadata_ldadd = \ - $(top_builddir)/t/lib/libtap.la \ - $(top_builddir)/t/lib/libtestutil.la \ - $(top_builddir)/common/src/libunifyfs_common.la \ - $(top_builddir)/server/src/libunifyfsd.a \ - $(top_builddir)/meta/src/libmdhim.a \ - $(LEVELDB_LDFLAGS) $(LEVELDB_LIBS) \ - $(MPI_CLDFLAGS) \ - $(MERCURY_LDFLAGS) $(MERCURY_LIBS) \ - $(ARGOBOTS_LDFLAGS) $(ARGOBOTS_LIBS) \ - $(MARGO_LDFLAGS) $(MARGO_LIBS) \ - $(FLATCC_LDFLAGS) $(FLATCC_LIBS) \ - -lpthread -lm -lstdc++ -lrt - -test_meta_cppflags = \ +test_common_cppflags = \ -I$(top_srcdir) \ - -I$(top_srcdir)/server/src \ -I$(top_srcdir)/common/src \ -D_GNU_SOURCE \ - $(AM_CPPFLAGS) \ - $(MARGO_CFLAGS) \ - $(MPI_CFLAGS) + $(AM_CPPFLAGS) test_cppflags = \ -I$(top_srcdir) \ @@ -91,13 +92,21 @@ test_cppflags = \ $(AM_CPPFLAGS) \ $(MPI_CFLAGS) + sys_sysio_gotcha_t_SOURCES = sys/sysio_suite.h \ sys/sysio_suite.c \ sys/creat-close.c \ sys/creat64.c \ sys/mkdir-rmdir.c \ sys/open.c \ - sys/open64.c + sys/open64.c \ + sys/lseek.c \ + sys/write-read.c \ + sys/write-read-hole.c \ + sys/truncate.c \ + sys/unlink.c \ + sys/chdir.c + sys_sysio_gotcha_t_CPPFLAGS = $(test_cppflags) sys_sysio_gotcha_t_LDADD = $(test_ldadd) sys_sysio_gotcha_t_LDFLAGS = $(AM_LDFLAGS) @@ -108,35 +117,53 @@ sys_sysio_static_t_SOURCES = sys/sysio_suite.h \ sys/creat64.c \ sys/mkdir-rmdir.c \ sys/open.c \ - sys/open64.c + sys/open64.c \ + sys/lseek.c \ + sys/write-read.c \ + sys/write-read-hole.c \ + sys/truncate.c \ + 
sys/unlink.c \ + sys/chdir.c + sys_sysio_static_t_CPPFLAGS = $(test_cppflags) sys_sysio_static_t_LDADD = $(test_static_ldadd) sys_sysio_static_t_LDFLAGS = $(test_static_ldflags) std_stdio_gotcha_t_SOURCES = std/stdio_suite.h \ std/stdio_suite.c \ - std/fopen-fclose.c + std/fopen-fclose.c \ + std/fseek-ftell.c \ + std/fwrite-fread.c \ + std/fflush.c \ + std/size.c + std_stdio_gotcha_t_CPPFLAGS = $(test_cppflags) std_stdio_gotcha_t_LDADD = $(test_ldadd) std_stdio_gotcha_t_LDFLAGS = $(AM_LDFLAGS) std_stdio_static_t_SOURCES = std/stdio_suite.h \ std/stdio_suite.c \ - std/fopen-fclose.c + std/fopen-fclose.c \ + std/fseek-ftell.c \ + std/fwrite-fread.c \ + std/fflush.c \ + std/size.c + std_stdio_static_t_CPPFLAGS = $(test_cppflags) std_stdio_static_t_LDADD = $(test_static_ldadd) std_stdio_static_t_LDFLAGS = $(test_static_ldflags) -server_metadata_t_SOURCES = \ - server/metadata_suite.h \ - server/metadata_suite.c \ - server/unifyfs_meta_get_test.c - -server_metadata_t_CPPFLAGS = $(test_meta_cppflags) -server_metadata_t_LDADD = $(test_metadata_ldadd) -server_metadata_t_LDFLAGS = $(AM_LDFLAGS) - unifyfs_unmount_t_SOURCES = unifyfs_unmount.c unifyfs_unmount_t_CPPFLAGS = $(test_cppflags) -unifyfs_unmount_t_LDADD = $(test_ldadd) -unifyfs_unmount_t_LDFLAGS = $(AM_LDFLAGS) +unifyfs_unmount_t_LDADD = $(test_static_ldadd) +unifyfs_unmount_t_LDFLAGS = $(test_static_ldflags) + +common_seg_tree_test_t_SOURCES = common/seg_tree_test.c +common_seg_tree_test_t_CPPFLAGS = $(test_common_cppflags) +common_seg_tree_test_t_LDADD = $(test_common_ldadd) +common_seg_tree_test_t_LDFLAGS = $(test_common_ldflags) + +common_slotmap_test_t_SOURCES = common/slotmap_test.c +common_slotmap_test_t_CPPFLAGS = $(test_common_cppflags) +common_slotmap_test_t_LDADD = $(test_common_ldadd) +common_slotmap_test_t_LDFLAGS = $(test_common_ldflags) diff --git a/t/ci/001-setup.sh b/t/ci/001-setup.sh index bdead8817..d4553bbbe 100755 --- a/t/ci/001-setup.sh +++ b/t/ci/001-setup.sh @@ -7,13 +7,13 @@ # desired. To run all tests simply run the RUN_TESTS.sh script. If Individual # tests are desired to be run, source the 001-setup.sh script first, followed by # 002-start-server.sh. Then source each desired script after that preceded by -# `$CI_DIR`. When finished, source the 990-stop-server.sh script last. +# `$UNIFYFS_CI_DIR`. When finished, source the 990-stop-server.sh script last. # # E.g.: # $ . full/path/to/001-setup.sh -# $ . $CI_DIR/002-start-server.sh -# $ . $CI_DIR/100-writeread-tests.sh -# $ . $CI_DIR/990-stop-server.sh +# $ . $UNIFYFS_CI_DIR/002-start-server.sh +# $ . $UNIFYFS_CI_DIR/100-writeread-tests.sh +# $ . $UNIFYFS_CI_DIR/990-stop-server.sh # # To run all of the tests, simply run RUN_CI_TESTS.sh # @@ -36,9 +36,9 @@ Then source any desired test files. Lastly, source 990-stop-server.sh. E.g.: $ . full/path/to/001-setup.sh - $ . $CI_DIR/002-start-server.sh - $ . $CI_DIR/100-writeread-tests.sh - $ . $CI_DIR/990-stop-server.sh + $ . \$UNIFYFS_CI_DIR/002-start-server.sh + $ . \$UNIFYFS_CI_DIR/100-writeread-tests.sh + $ . \$UNIFYFS_CI_DIR/990-stop-server.sh To run all of the tests, simply run RUN_CI_TESTS.sh. @@ -72,7 +72,6 @@ done [[ -z $infomsg ]] && infomsg="-- UNIFYFS JOB INFO:" [[ -z $errmsg ]] && errmsg="!!!! UNIFYFS JOB ERROR:" -export CI_PROJDIR=${CI_PROJDIR:-$HOME} export TMPDIR=${TMPDIR:-/tmp} export SYSTEM_NAME=$(echo $(hostname) | sed -r 's/(^[[:alpha:]]*)(.*)/\1/') @@ -81,13 +80,19 @@ export SYSTEM_NAME=$(echo $(hostname) | sed -r 's/(^[[:alpha:]]*)(.*)/\1/') # Set up sharness variables and functions for TAP testing. 
echo "$infomsg Setting up sharness" -CI_DIR=${CI_DIR:-$(dirname "$(readlink -fm $BASH_SOURCE)")} -SHARNESS_DIR="$(dirname "$CI_DIR")" -echo "$infomsg CI_DIR: $CI_DIR" +UNIFYFS_CI_DIR=${UNIFYFS_CI_DIR:-$(dirname "$(readlink -fm $BASH_SOURCE)")} +SHARNESS_DIR="$(dirname "$UNIFYFS_CI_DIR")" +UNIFYFS_SOURCE_DIR="$(dirname "$SHARNESS_DIR")" +BASE_SEARCH_DIR=${BASE_SEARCH_DIR:-"$(dirname "$UNIFYFS_SOURCE_DIR")"} +echo "$infomsg UNIFYFS_CI_DIR: $UNIFYFS_CI_DIR" echo "$infomsg SHARNESS_DIR: $SHARNESS_DIR" +echo "$infomsg UNIFYFS_SOURCE_DIR: $UNIFYFS_SOURCE_DIR" +echo "$infomsg BASE_SEARCH_DIR: $BASE_SEARCH_DIR" + +SHARNESS_TEST_DIRECTORY=${SHARNESS_TEST_DIRECTORY:-$UNIFYFS_CI_DIR} source ${SHARNESS_DIR}/sharness.sh source $SHARNESS_DIR/sharness.d/02-functions.sh -source $CI_DIR/ci-functions.sh +source $UNIFYFS_CI_DIR/ci-functions.sh ########## Locate UnifyFS install and examples ########## @@ -101,22 +106,21 @@ echo "$infomsg Looking for UnifyFS install directory..." # Look for UnifyFS install directory if the user didn't already set # $UNIFYFS_INSTALL to the directory containing bin/ and libexec/ if [[ -z $UNIFYFS_INSTALL ]]; then - # Check for $SPACK_ROOT and if unifyfs is installed - if [[ -n $SPACK_ROOT && -d $(spack location -i unifyfs 2>/dev/null) ]]; + # Search for unifyfsd starting in $BASE_SEARCH_DIR and omitting SPACK_ROOT + unifyfsd_exe="$(find_executable $BASE_SEARCH_DIR "*/bin/unifyfsd"\ + $SPACK_ROOT)" + if [[ -x $unifyfsd_exe ]]; then + # Set UNIFYFS_INSTALL to the dir containing bin/ and libexec/ + UNIFYFS_INSTALL="$(dirname "$(dirname "$unifyfsd_exe")")" + # Else check for $SPACK_ROOT and if unifyfs is installed + elif [[ -n $SPACK_ROOT && -d $(spack location -i unifyfs 2>/dev/null) ]]; then # Might have a problem with variants and arch UNIFYFS_INSTALL="$(spack location -i unifyfs)" - # Else search for unifyfsd starting in $CI_PROJDIR and omitting spack_root - elif [[ -x $(find_executable $CI_PROJDIR "*/bin/unifyfsd" $SPACK_ROOT) ]]; - then - # Set UNIFYFS_INSTALL to the dir containing bin/ and libexec/ - UNIFYFS_INSTALL="$(dirname "$(dirname \ - "$(find_executable $CI_PROJDIR "*/bin/unifyfsd" $SPACK_ROOT)")")" else echo >&2 "$errmsg Unable to find UnifyFS install directory" - echo >&2 "$errmsg \`spack install unifyfs\`, set the" \ - "\$UNIFYFS_INSTALL envar to the directory containing bin/" \ - "and libexec/, or manually install to \$CI_PROJDIR/*" + echo >&2 "$errmsg Set \$UNIFYFS_INSTALL to the directory containing" \ + "bin/ and libexec/ or \`spack install unifyfs\`" exit 1 fi fi @@ -133,6 +137,7 @@ if [[ -d $UNIFYFS_INSTALL && -d ${UNIFYFS_INSTALL}/bin && else echo >&2 "$errmsg Ensure \$UNIFYFS_INSTALL exists and is the directory" \ "containing bin/ and libexec/" + exit 1 fi # Check for necessary Spack modules if Spack is detected @@ -140,7 +145,7 @@ fi # don't fail out if [[ -n $(which spack 2>/dev/null) ]]; then loaded_modules=$(module list 2>&1) - modules="gotcha leveldb flatcc argobots mercury margo" + modules="gotcha argobots mercury margo spath" for mod in $modules; do if ! [[ $(echo "$loaded_modules" | fgrep "$mod") ]]; then echo "$errmsg $mod not detected. 
Please 'spack load $mod'" @@ -155,9 +160,9 @@ fi # TODO: mpirun compatibility echo "$infomsg Finding job launcher" if [[ -n $(which jsrun 2>/dev/null) ]]; then - source $CI_DIR/setup-lsf.sh + source $UNIFYFS_CI_DIR/setup-lsf.sh elif [[ -n $(which srun 2>/dev/null) ]]; then - source $CI_DIR/setup-slurm.sh + source $UNIFYFS_CI_DIR/setup-slurm.sh else echo >&2 "$errmsg Failed to find a suitable parallel job launcher" exit 1 @@ -174,19 +179,23 @@ export UNIFYFS_LOG_VERBOSITY=${UNIFYFS_LOG_VERBOSITY:-5} # an alternate location for the logs if [[ -z $UNIFYFS_LOG_DIR ]]; then # User can choose to not cleanup logs on success - export CI_LOG_CLEANUP=${CI_LOG_CLEANUP:-yes} - # If no log cleanup, move logs to $CI_DIR - if [[ $CI_LOG_CLEANUP =~ ^(no|NO)$ || $CI_CLEANUP =~ ^(no|NO)$ ]]; then - logdir=$CI_DIR/${SYSTEM_NAME}_${JOB_ID}_logs + export UNIFYFS_CI_LOG_CLEANUP=${UNIFYFS_CI_LOG_CLEANUP:-yes} + # If no log cleanup, move logs to $UNIFYFS_CI_DIR + if [[ $UNIFYFS_CI_LOG_CLEANUP =~ ^(no|NO)$ ]] || \ + [[ $UNIFYFS_CI_CLEANUP =~ ^(no|NO)$ ]] + then + logdir=$UNIFYFS_CI_DIR/${SYSTEM_NAME}_${JOB_ID}_logs else # else put logs in sharness trash dir that sharness deletes logdir=$SHARNESS_TRASH_DIRECTORY/${SYSTEM_NAME}_${JOB_ID}_logs - echo "$infomsg Set CI_LOG_CLEANUP=no to keep logs when all tests pass" + echo "$infomsg Set UNIFYFS_CI_LOG_CLEANUP=no to keep logs when all" \ + "tests pass" fi - mkdir -p $logdir fi export UNIFYFS_LOG_DIR=${UNIFYFS_LOG_DIR:-$logdir} +mkdir -p $UNIFYFS_LOG_DIR echo "$infomsg Logs are in UNIFYFS_LOG_DIR: $UNIFYFS_LOG_DIR" +# sharedfs export UNIFYFS_SHAREDFS_DIR=${UNIFYFS_SHAREDFS_DIR:-$UNIFYFS_LOG_DIR} echo "$infomsg UNIFYFS_SHAREDFS_DIR set as $UNIFYFS_SHAREDFS_DIR" @@ -195,50 +204,47 @@ export UNIFYFS_DAEMONIZE=${UNIFYFS_DAEMONIZE:-off} # temp nlt=${TMPDIR}/unifyfs.${USER}.${SYSTEM_NAME}.${JOB_ID} -export CI_TEMP_DIR=${CI_TEMP_DIR:-$nlt} -export UNIFYFS_RUNSTATE_DIR=${UNIFYFS_RUNSTATE_DIR:-$CI_TEMP_DIR} -export UNIFYFS_META_DB_PATH=${UNIFYFS_META_DB_PATH:-$CI_TEMP_DIR} +export UNIFYFS_CI_TEMP_DIR=${UNIFYFS_CI_TEMP_DIR:-$nlt} +$JOB_RUN_ONCE_PER_NODE mkdir -p $UNIFYFS_CI_TEMP_DIR +export UNIFYFS_RUNSTATE_DIR=${UNIFYFS_RUNSTATE_DIR:-$UNIFYFS_CI_TEMP_DIR} +export UNIFYFS_META_DB_PATH=${UNIFYFS_META_DB_PATH:-$UNIFYFS_CI_TEMP_DIR} echo "$infomsg UNIFYFS_RUNSTATE_DIR set as $UNIFYFS_RUNSTATE_DIR" echo "$infomsg UNIFYFS_META_DB_PATH set as $UNIFYFS_META_DB_PATH" -echo "$infomsg Set CI_TEMP_DIR to change both of these to same path" +echo "$infomsg Set UNIFYFS_CI_TEMP_DIR to change both of these to same path" # storage nls=$nlt -export CI_STORAGE_DIR=${CI_STORAGE_DIR:-$nls} -export UNIFYFS_SPILLOVER_SIZE=${UNIFYFS_SPILLOVER_SIZE:-$GB} -export UNIFYFS_SPILLOVER_ENABLED=${UNIFYFS_SPILLOVER_ENABLED:-yes} -export UNIFYFS_SPILLOVER_DATA_DIR=${UNIFYFS_SPILLOVER_DATA_DIR:-$CI_STORAGE_DIR} -export UNIFYFS_SPILLOVER_META_DIR=${UNIFYFS_SPILLOVER_META_DIR:-$CI_STORAGE_DIR} -echo "$infomsg UNIFYFS_SPILLOVER_DATA_DIR set as $UNIFYFS_SPILLOVER_DATA_DIR" -echo "$infomsg UNIFYFS_SPILLOVER_META_DIR set as $UNIFYFS_SPILLOVER_META_DIR" -echo "$infomsg Set CI_STORAGE_DIR to change both of these to same path" +export UNIFYFS_LOGIO_SPILL_SIZE=${UNIFYFS_LOGIO_SPILL_SIZE:-$((5 * GB))} +export UNIFYFS_LOGIO_SPILL_DIR=${UNIFYFS_LOGIO_SPILL_DIR:-$nls} +echo "$infomsg UNIFYFS_LOGIO_SPILL_SIZE set as $UNIFYFS_LOGIO_SPILL_SIZE" +echo "$infomsg UNIFYFS_LOGIO_SPILL_DIR set as $UNIFYFS_LOGIO_SPILL_DIR" ########## Set up mountpoints and sharness testing prereqs ########## # Running tests with UNIFYFS_MOUNTPOINT set to 
a real dir will disable posix -# tests unless user sets CI_TEST_POSIX=yes +# tests unless user sets UNIFYFS_CI_TEST_POSIX=yes export UNIFYFS_MP=${UNIFYFS_MOUNTPOINT:-/unifyfs} # If UNIFYFS_MOUNTPOINT is real dir, disable posix tests (unless user wants it) # and set REAL_MP prereq to enable test that checks if UNIFYFS_MOUNTPOINT is # empty if [[ -d $UNIFYFS_MP ]]; then - export CI_TEST_POSIX=no + export UNIFYFS_CI_TEST_POSIX=no test_set_prereq REAL_MP fi echo "$infomsg UNIFYFS_MOUNTPOINT established: $UNIFYFS_MP" -export CI_TEST_POSIX=${CI_TEST_POSIX:-yes} +export UNIFYFS_CI_TEST_POSIX=${UNIFYFS_CI_TEST_POSIX:-yes} # Set up a real mountpoint for posix tests to write files to and allow tests to # check that those files exist -if [[ ! $CI_TEST_POSIX =~ ^(no|NO)$ ]]; then - if [[ -z $CI_POSIX_MP ]]; then +if [[ ! $UNIFYFS_CI_TEST_POSIX =~ ^(no|NO)$ ]]; then + if [[ -z $UNIFYFS_CI_POSIX_MP ]]; then # needs to be a shared file system pmp=${SHARNESS_TRASH_DIRECTORY}/unify_posix_mp.${SYSTEM_NAME}.${JOB_ID} - mkdir $pmp fi - export CI_POSIX_MP=${CI_POSIX_MP:-$pmp} - echo "$infomsg CI_POSIX_MP established: $CI_POSIX_MP" + export UNIFYFS_CI_POSIX_MP=${UNIFYFS_CI_POSIX_MP:-$pmp} + mkdir -p $UNIFYFS_CI_POSIX_MP + echo "$infomsg UNIFYFS_CI_POSIX_MP established: $UNIFYFS_CI_POSIX_MP" # Set test_posix prereq test_set_prereq TEST_POSIX @@ -248,8 +254,9 @@ fi [[ -n $(which pdsh 2>/dev/null) ]] && test_set_prereq PDSH # skip cleanup_hosts test in 990-stop_server.sh if cleanup is not desired -export CI_HOST_CLEANUP=${CI_HOST_CLEANUP:-yes} -if ! [[ $CI_HOST_CLEANUP =~ ^(no|NO)$ || $CI_CLEANUP =~ ^(no|NO)$ ]]; then +export UNIFYFS_CI_HOST_CLEANUP=${UNIFYFS_CI_HOST_CLEANUP:-yes} +if ! [[ $UNIFYFS_CI_HOST_CLEANUP =~ ^(no|NO)$ ]] || \ + [[ $UNIFYFS_CI_CLEANUP =~ ^(no|NO)$ ]]; then test_set_prereq CLEAN fi diff --git a/t/ci/002-start-server.sh b/t/ci/002-start-server.sh index 8d9d0a4a5..88e834657 100755 --- a/t/ci/002-start-server.sh +++ b/t/ci/002-start-server.sh @@ -44,8 +44,8 @@ fi # If running posix tests, posix mountpoint needs to be a real, shared dir # If it's not, prereq will not be set and posix tests will be skipped -test_expect_success TEST_POSIX "CI_POSIX_MP ($CI_POSIX_MP) is shared dir" ' - test_path_is_shared_dir $CI_POSIX_MP && +test_expect_success TEST_POSIX "POSIX_MP ($UNIFYFS_CI_POSIX_MP) is shared dir" ' + test_path_is_shared_dir $UNIFYFS_CI_POSIX_MP && test_set_prereq POSIX ' @@ -55,7 +55,7 @@ test_expect_success "unifyfsd hasn't started yet" ' ' $UNIFYFS_BIN/unifyfs start -c -d -S $UNIFYFS_SHAREDFS_DIR \ - -e $UNIFYFS_BIN/unifyfsd &> ${UNIFYFS_LOG_DIR}/unifyfs.start.out & + -e $UNIFYFS_BIN/unifyfsd &> ${UNIFYFS_LOG_DIR}/unifyfs.start.out test_expect_success "unifyfsd started" ' process_is_running unifyfsd 10 || diff --git a/t/ci/100-writeread-tests.sh b/t/ci/100-writeread-tests.sh index 6969dd607..d73b735a6 100755 --- a/t/ci/100-writeread-tests.sh +++ b/t/ci/100-writeread-tests.sh @@ -68,7 +68,7 @@ unify_test_writeread() { # Evaluate output test_expect_success "$app_name $app_args: (line_count=${lcount}, rc=$rc)" ' test $rc = 0 && - test $lcount = 8 + test $lcount = 29 ' } @@ -86,11 +86,11 @@ unify_test_writeread_posix() { # Evaluate output test_expect_success POSIX "$app_name $1: (line_count=${lcount}, rc=$rc)" ' test $rc = 0 && - test $lcount = 8 && + test $lcount = 29 && if [[ $io_pattern =~ (n1)$ ]]; then - test_path_is_file ${CI_POSIX_MP}/$filename + test_path_is_file ${UNIFYFS_CI_POSIX_MP}/$filename else - test_path_has_file_per_process $CI_POSIX_MP $filename + 
test_path_has_file_per_process $UNIFYFS_CI_POSIX_MP $filename fi ' } @@ -215,18 +215,18 @@ unify_test_writeread $runmode "$app_args" runmode=static unify_test_writeread $runmode "$app_args" -# Increase sizes: -n 32 -c 1MB -b 16MB +# Increase sizes: -n 32 -c 4MB -b 16MB -# writeread-static -p n1 -n 32 -c 1MB -b 16MB -io_sizes="-n 32 -c $MB -b $((16 * $MB))" +# writeread-static -p n1 -n 32 -c 4MB -b 16MB +io_sizes="-n 32 -c $((4 * $MB)) -b $((16 * $MB))" app_args="$io_pattern $io_sizes" unify_test_writeread $runmode "$app_args" -# writeread-gotcha -p n1 -n 32 -c 1MB -b 16MB +# writeread-gotcha -p n1 -n 32 -c 4MB -b 16MB runmode=gotcha unify_test_writeread $runmode "$app_args" -# writeread-posix -p n1 -n 32 -c 1MB -b 16MB +# writeread-posix -p n1 -n 32 -c 4MB -b 16MB runmode=posix unify_test_writeread_posix "$app_args" @@ -234,13 +234,13 @@ unify_test_writeread_posix "$app_args" io_pattern="-p nn" app_args="$io_pattern $io_sizes" -# writeread-posix -p nn -n 32 -c 1MB -b 16MB +# writeread-posix -p nn -n 32 -c 4MB -b 16MB unify_test_writeread_posix "$app_args" -# writeread-gotcha -p nn -n 32 -c 1MB -b 16MB +# writeread-gotcha -p nn -n 32 -c 4MB -b 16MB runmode=gotcha unify_test_writeread $runmode "$app_args" -# writeread-static -p nn -n 32 -c 1MB -b 16MB +# writeread-static -p nn -n 32 -c 4MB -b 16MB runmode=static unify_test_writeread $runmode "$app_args" diff --git a/t/ci/110-write-tests.sh b/t/ci/110-write-tests.sh index bba5d1ea9..07640f5e1 100755 --- a/t/ci/110-write-tests.sh +++ b/t/ci/110-write-tests.sh @@ -68,7 +68,7 @@ unify_test_write() { # Evaluate output test_expect_success "$app_name $app_args: (line_count=${lcount}, rc=$rc)" ' test $rc = 0 && - test $lcount = 11 + test $lcount = 18 ' } @@ -86,11 +86,11 @@ unify_test_write_posix() { # Evaluate output test_expect_success POSIX "$app_name $1: (line_count=${lcount}, rc=$rc)" ' test $rc = 0 && - test $lcount = 11 && + test $lcount = 18 && if [[ $io_pattern =~ (n1)$ ]]; then - test_path_is_file ${CI_POSIX_MP}/$filename + test_path_is_file ${UNIFYFS_CI_POSIX_MP}/$filename else - test_path_has_file_per_process $CI_POSIX_MP $filename + test_path_has_file_per_process $UNIFYFS_CI_POSIX_MP $filename fi ' } @@ -215,18 +215,18 @@ unify_test_write $runmode "$app_args" runmode=static unify_test_write $runmode "$app_args" -# Increase sizes: -n 32 -c 1MB -b 16MB +# Increase sizes: -n 32 -c 4MB -b 16MB -# write-static -p n1 -n 32 -c 1MB -b 16MB -io_sizes="-n 32 -c $MB -b $((16 * $MB))" +# write-static -p n1 -n 32 -c 4MB -b 16MB +io_sizes="-n 32 -c $((4 * $MB)) -b $((16 * $MB))" app_args="$io_pattern $io_sizes" unify_test_write $runmode "$app_args" -# write-gotcha -p n1 -n 32 -c 1MB -b 16MB +# write-gotcha -p n1 -n 32 -c 4MB -b 16MB runmode=gotcha unify_test_write $runmode "$app_args" -# write-posix -p n1 -n 32 -c 1MB -b 16MB +# write-posix -p n1 -n 32 -c 4MB -b 16MB runmode=posix unify_test_write_posix "$app_args" @@ -234,13 +234,13 @@ unify_test_write_posix "$app_args" io_pattern="-p nn" app_args="$io_pattern $io_sizes" -# write-posix -p nn -n 32 -c 1MB -b 16MB +# write-posix -p nn -n 32 -c 4MB -b 16MB unify_test_write_posix "$app_args" -# write-gotcha -p nn -n 32 -c 1MB -b 16MB +# write-gotcha -p nn -n 32 -c 4MB -b 16MB runmode=gotcha unify_test_write $runmode "$app_args" -# write-static -p nn -n 32 -c 1MB -b 16MB +# write-static -p nn -n 32 -c 4MB -b 16MB runmode=static unify_test_write $runmode "$app_args" diff --git a/t/ci/120-read-tests.sh b/t/ci/120-read-tests.sh index 686c4e94f..6acc0d45a 100755 --- a/t/ci/120-read-tests.sh +++ 
b/t/ci/120-read-tests.sh @@ -68,7 +68,7 @@ unify_test_read() { # Evaluate output test_expect_success "$app_name $app_args: (line_count=${lcount}, rc=$rc)" ' test $rc = 0 && - test $lcount = 11 + test $lcount = 14 ' } @@ -85,7 +85,7 @@ unify_test_read_posix() { # Evaluate output test_expect_success POSIX "$app_name $1: (line_count=${lcount}, rc=$rc)" ' test $rc = 0 && - test $lcount = 11 + test $lcount = 14 ' } @@ -209,18 +209,18 @@ unify_test_read $runmode "$app_args" runmode=static unify_test_read $runmode "$app_args" -# Increase sizes: -n 32 -c 1MB -b 16MB +# Increase sizes: -n 32 -c 4MB -b 16MB -# read-static -p n1 -n 32 -c 1MB -b 16MB -io_sizes="-n 32 -c $MB -b $((16 * $MB))" +# read-static -p n1 -n 32 -c 4MB -b 16MB +io_sizes="-n 32 -c $((4 * $MB)) -b $((16 * $MB))" app_args="$io_pattern $io_sizes" unify_test_read $runmode "$app_args" -# read-gotcha -p n1 -n 32 -c 1MB -b 16MB +# read-gotcha -p n1 -n 32 -c 4MB -b 16MB runmode=gotcha unify_test_read $runmode "$app_args" -# read-posix -p n1 -n 32 -c 1MB -b 16MB +# read-posix -p n1 -n 32 -c 4MB -b 16MB runmode=posix unify_test_read_posix "$app_args" @@ -228,13 +228,13 @@ unify_test_read_posix "$app_args" io_pattern="-p nn" app_args="$io_pattern $io_sizes" -# read-posix -p n1 -n 32 -c 1MB -b 16MB +# read-posix -p n1 -n 32 -c 4MB -b 16MB unify_test_read_posix "$app_args" -# read-gotcha -p n1 -n 32 -c 1MB -b 16MB +# read-gotcha -p n1 -n 32 -c 4MB -b 16MB runmode=gotcha unify_test_read $runmode "$app_args" -# read-static -p n1 -n 32 -c 1MB -b 16MB +# read-static -p n1 -n 32 -c 4MB -b 16MB runmode=static unify_test_read $runmode "$app_args" diff --git a/t/ci/990-stop-server.sh b/t/ci/990-stop-server.sh index def649c87..a27dd5672 100755 --- a/t/ci/990-stop-server.sh +++ b/t/ci/990-stop-server.sh @@ -40,8 +40,8 @@ test_expect_success REAL_MP "Verify UNIFYFS_MOUNTPOINT ($UNIFYFS_MP) is empty" ' ' # Cleanup posix mountpoint -test_expect_success POSIX "Cleanup CI_POSIX_MP: $CI_POSIX_MP" ' - rm -rf $CI_POSIX_MP/*posix* +test_expect_success POSIX "Cleanup UNIFYFS_CI_POSIX_MP: $UNIFYFS_CI_POSIX_MP" ' + rm -rf $UNIFYFS_CI_POSIX_MP/*posix* ' # cleanup_hosts @@ -55,4 +55,11 @@ test_expect_success PDSH,CLEAN "Cleanup hosts" ' trap - EXIT # end here if running tests individually -[[ -z $full_run ]] && test_done +if [[ -z $full_run ]]; then + ( test_done; ) + test_exit_code=$? + + cd "$(dirname "$SHARNESS_TRASH_DIRECTORY")" + + return $test_exit_code +fi diff --git a/t/ci/README.md b/t/ci/README.md index 5aa441012..6614a6aff 100644 --- a/t/ci/README.md +++ b/t/ci/README.md @@ -47,16 +47,16 @@ $ prove -v RUN_CI_TESTS.sh In order to run individual tests, source the `001-setup.sh` script first, followed by `002-start-server.sh`. Then source each desired script after that -preceded by `$CI_DIR`. When finished, source the `990-stop-server.sh` script -last. +preceded by `$UNIFYFS_CI_DIR`. When finished, source the `990-stop-server.sh` +script last. E.g.: ```shell -$ . full/path/to/001-setup.sh -$ . $CI_DIR/002-start-server.sh -$ . $CI_DIR/100-writeread-tests.sh -$ . $CI_DIR/990-stop-server.sh +$ . ./001-setup.sh +$ . $UNIFYFS_CI_DIR/002-start-server.sh +$ . $UNIFYFS_CI_DIR/100-writeread-tests.sh +$ . 
$UNIFYFS_CI_DIR/990-stop-server.sh ``` If additional tests are desired, create a script after the fashion of diff --git a/t/ci/RUN_CI_TESTS.sh b/t/ci/RUN_CI_TESTS.sh index 48a6910a9..dd8c4034f 100755 --- a/t/ci/RUN_CI_TESTS.sh +++ b/t/ci/RUN_CI_TESTS.sh @@ -13,14 +13,14 @@ # # If individual tests are desired to be run, source the 001-setup.sh script # first, followed by 002-start-server.sh. Then source each desired script after -# that preceded by `$CI_DIR`. When finished, source the 990-stop-server.sh -# script last. +# that preceded by `$UNIFYFS_CI_DIR`. When finished, source the +# 990-stop-server.sh script last. # # E.g.: # $ . full/path/to/001-setup.sh -# $ . $CI_DIR/002-start-server.sh -# $ . $CI_DIR/100-writeread-tests.sh -# $ . $CI_DIR/990-stop-server.sh +# $ . $UNIFYFS_CI_DIR/002-start-server.sh +# $ . $UNIFYFS_CI_DIR/100-writeread-tests.sh +# $ . $UNIFYFS_CI_DIR/990-stop-server.sh # # Before doing either of these, make sure you have interactively allocated nodes # or are submitting a batch job. @@ -51,9 +51,9 @@ Then source any desired test files. Lastly, source 990-stop-server.sh. E.g.: $ . full/path/to/001-setup.sh - $ . $CI_DIR/002-start-server.sh - $ . $CI_DIR/100-writeread-tests.sh - $ . $CI_DIR/990-stop-server.sh + $ . \$UNIFYFS_CI_DIR/002-start-server.sh + $ . \$UNIFYFS_CI_DIR/100-writeread-tests.sh + $ . \$UNIFYFS_CI_DIR/990-stop-server.sh Before doing either of these, make sure you have interactively allocated nodes or are submitting a batch job. @@ -78,18 +78,18 @@ SECONDS=0 start_time=$SECONDS echo "Started RUN_TESTS.sh @: $(date)" -# Set up CI_DIR if this script is called first -CI_DIR=${CI_DIR:-"$(dirname "$(readlink -fm $BASH_SOURCE)")"} +# Set up UNIFYFS_CI_DIR if this script is called first +UNIFYFS_CI_DIR=${UNIFYFS_CI_DIR:-"$(dirname "$(readlink -fm $BASH_SOURCE)")"} # test_done gets called in 990-stop-server.sh if this is not set. # If not set, tests can be run individually full_run=true # setup testing -source $CI_DIR/001-setup.sh +source $UNIFYFS_CI_DIR/001-setup.sh # start unifyfsd -source $CI_DIR/002-start-server.sh +source $UNIFYFS_CI_DIR/002-start-server.sh # determine time setup took setup_time=$SECONDS @@ -101,13 +101,13 @@ echo "Setup time -- $(elapsed_time start_time setup_time)" ############################################################################## # writeread example tests -source $CI_DIR/100-writeread-tests.sh +source $UNIFYFS_CI_DIR/100-writeread-tests.sh # write example tests -source $CI_DIR/110-write-tests.sh +source $UNIFYFS_CI_DIR/110-write-tests.sh # read example tests -source $CI_DIR/120-read-tests.sh +source $UNIFYFS_CI_DIR/120-read-tests.sh ############################################################################## # DO NOT add additional tests after this point @@ -117,7 +117,7 @@ testing_time=$SECONDS echo "Testing time -- $(elapsed_time setup_time testing_time)" # stop unifyfsd and cleanup -source $CI_DIR/990-stop-server.sh +source $UNIFYFS_CI_DIR/990-stop-server.sh end_time=$SECONDS echo "All done @ $(date)" diff --git a/t/ci/ci-functions.sh b/t/ci/ci-functions.sh index 1a820ee41..708a08bca 100755 --- a/t/ci/ci-functions.sh +++ b/t/ci/ci-functions.sh @@ -150,8 +150,9 @@ find_executable() local l_target="-path $2 -print -quit" local l_ret="$(find $1 -executable $l_prune $l_target)" + local l_rc=$? echo $l_ret - return 0 + return $l_rc } # Calculate the elapsed time between the two given times. 
@@ -160,7 +161,7 @@ find_executable() # $1 - The initial of the two times (in seconds) # $2 - The latter of the two times (in seconds) # -# Returns the elapsed time formated as HH:MM:SS +# Returns the elapsed time formatted as HH:MM:SS elapsed_time() { # USAGE: elapsed_time start_time_in_seconds end_time_in_seconds @@ -223,9 +224,9 @@ format_bytes() # # Bear in mind, the filename created in unify_run_test will have a .app suffix. # -# $1 - The app_name that will be prepended to the formated app_args in the +# $1 - The app_name that will be prepended to the formatted app_args in the # resulting filename -# $2 - The app_args that will be formated and appended to the app_name +# $2 - The app_args that will be formatted and appended to the app_name # $3 - Optional suffix to append to the end of the file # # Returns a string with the spaces removed and hyphens replaced by underscores @@ -265,26 +266,29 @@ get_filename() } # Builds the test command that will be executed. Automatically sets any options -# that are always wanted (-vkf and the appropriate -m if posix test or not). +# that are always wanted (-vkfo and the appropriate -m if posix test or not). # # Automatically builds the filename for -f based on the input app_name and # app_args and has .app appended to the end. This filename then also has .err # appended and is used for the stderr output file with JOB_RUN_COMMAND. # -# Args that can be passed in are ([-pncbx][-A|-M|-P|-S|-V]). All other args are -# set automatically. +# Args that can be passed in are ([-pncbx][-A|-M|-N|-P|-S|-V]). All other args +# are set automatically. # # $1 - Name of the example application to be tested (basetest-runmode) -# $2 - Args for $1 consisting of ([-pncbx][-A|-M|-P|-S|-V]). Encase in quotes. +# $2 - Args for $1 consisting of ([-pncbx][-A|-M|-N|-P|-S|-V]). Encase in +# quotes. # $3 - The runmode of test, used to determine if posix and set correct args # # Returns the full test command ready to be executed. build_test_command() { # USAGE: build_test_command app_exe_name app_args([-pncbx][-A|-M|-P|-S|-V]) + # runmode([static|gotcha|posix]) if [[ $# -ne 3 ]]; then echo >&2 "$errmsg USAGE: $FUNCNAME app_name" \ - "app_args([-pncbx][-A|-M|-P|-S|-V]) runmode" + "app_args([-pncbx][-A|-M|-N|-P|-S|-V])" \ + "runmode([static|gotcha|posix])" return 1 fi @@ -297,27 +301,31 @@ build_test_command() # Build example_command with options that are always wanted. 
Might need to # adjust for other tests (i.e., app-mpiio), or write new functions - local l_verbose="-v" local l_app_id="-a $app_id" + #local l_verbose="-v" # Sends DEBUG and test configuration info to stdout # Filename needs to be the write file if testing the read example local l_app_name=$(echo $1 | sed -r 's/(\w)-.*/\1/') if [[ $l_app_name = "read" ]]; then local l_app_filename="-f $(get_filename write-$3 "$2").app" else - #local l_check="-k" # not reliable atm local l_app_filename="-f ${l_filename}.app" fi - # Set mountpoint to an existing one if running posix test + # Set mountpoint to an existing dir & disable UnifyFS if running posix test if [[ $3 = "posix" ]]; then - local l_mount="-U -m $CI_POSIX_MP" + local l_mount="-U -m $UNIFYFS_CI_POSIX_MP" else local l_mount="-m $UNIFYFS_MP" + local l_check="-k" # read.c tests fail on posix files fi + # Add outfile option for checking line count after test completes + local l_outfile="-o ${UNIFYFS_LOG_DIR}/${l_filename}.out" + # Assemble full example_command - local l_app_args="$2 $l_app_id $l_check $l_verbose $l_mount $l_app_filename" + local l_app_args="$2 $l_app_id $l_check $l_verbose $l_mount $l_outfile \ + $l_app_filename" local l_full_app_name="${UNIFYFS_EXAMPLES}/${1} $l_app_args" # Assemble full test_command @@ -331,13 +339,13 @@ build_test_command() # testing files. # # The build_test_command is called which automatically sets any options that -# are always wanted (-vkf and appropriate -m if posix test or not). The stderr +# are always wanted (-vkfo and appropriate -m if posix test or not). The stderr # output file is also created (based on the filename that is autogenerated) and # the appropriate option is set for the JOB_RUN_COMMAND. # -# Args that can be passed in are ([-pncbx][-A|-M|-P|-S|-V]). All other args are -# set automatically, including the filename (which is generated based on the -# input app_name and app_args). +# Args that can be passed in are ([-pncbx][-A|-M|-N|-P|-S|-V]). All other args +# are set automatically, including the filename (which is generated based on +# the input app_name and app_args). # # The third parameter is an optional "pass-by-reference" parameter that can # contain the variable name for the resulting output to be stored in. @@ -346,18 +354,19 @@ build_test_command() # 2. app_output=$(unify_run_test $app_name "app_args") # # $1 - Name and runmode of the example application to be tested -# $2 - Args for $1 consisting of ([-pncbx][-A|-M|-P|-S|-V]). Encase in quotes. +# $2 - Args for $1 consisting of ([-pncbx][-A|-M|-N|-P|-S|-V]). Encase in +# quotes. # $3 - Optional output variable that is "passed by reference". # # Returns the return code of the executed example as well as the output # produced by running the example. unify_run_test() { - # USAGE: unify_run_test app_name app_args([-pncbx][-A|-M|-P|-S|-V]) + # USAGE: unify_run_test app_name app_args([-pncbx][-A|-M|-N|-P|-S|-V]) # [output_variable_name] if [[ $# -lt 2 || $# -gt 3 ]]; then echo >&2 "$errmsg USAGE: $FUNCNAME app_name" \ - "app_args([-pncbx][-A|-M|-P|-S|-V]) [output_variable_name]" + "app_args([-pncbx][-A|-M|-N|-P|-S|-V]) [output_variable_name]" return 1 fi @@ -368,14 +377,15 @@ unify_run_test() return 1 fi - # Skip this test if posix test and CI_TEST_POSIX=no|NO + # Skip this test if posix test and UNIFYFS_CI_TEST_POSIX=no|NO if ! 
test_have_prereq POSIX && [[ $l_runmode = "posix" ]]; then return 42 fi - # Fail if user passed in filename, mountpoint, verbose or disable - # UnifyFS since these are auto added - local opt='(-f|--file|-m|--mount|-v|--verbose|-U|--disable-unifyfs)' + # Fail if user passed in filename, check, mountpoint, outfile, verbose or + # disable UnifyFS since these are added automatically + local opt="(-f|--file|-k|--check|-m|--mount|-o|--outfile|-v|--verbose|-U|\ + --disable-unifyfs)" for s in $2; do if [[ $s =~ $opt ]]; then echo >&2 "$errmsg Call $FUNCNAME without $opt. Found $s" @@ -394,13 +404,17 @@ unify_run_test() local l_app_output; l_app_output="$($l_test_command)" local l_rc=$? - # Put the resulting output in the optional reference parameter + # Capture the outfile contents for line count checks + local l_outfile_name=$(get_filename $1 "$2" ".out") + local l_outfile_contents="$(cat ${UNIFYFS_LOG_DIR}/${l_outfile_name})" + + # Put the resulting outfile contents in the optional reference parameter local l_input_var=$3 if [[ "$l_input_var" ]]; then - eval $l_input_var="'$l_app_output'" + eval $l_input_var="'$l_outfile_contents'" fi - echo "$l_app_output" + echo "$l_outfile_contents" return $l_rc } @@ -410,6 +424,11 @@ unify_run_test() # that were leftover on the hosts. cleanup_hosts() { + if ! test_have_prereq PDSH; then + echo >&2 "$errmsg PDSH prereq not set, cleanup_hosts() skipped." + echo >&2 "$errmsg PDSH is required to run cleanup_hosts()." + return 1 + fi # Capture all output from cleanup in a log exec 3>&1 4>&2 @@ -439,11 +458,14 @@ cleanup_hosts() pdsh -w $l_hl 'test -f /dev/shm/svr_id && /bin/cat /dev/shm/svr_id' pdsh -w $l_hl 'test -f /dev/shm/unifyfsd_id && /bin/cat \ /dev/shm/unifyfsd_id' + pdsh -w $l_hl 'test -f /tmp/unifyfsd.err.* && /bin/cat \ + /tmp/unifyfsd.err.*' pdsh -w $l_hl '/bin/rm -rfv /tmp/na_sm /tmp/*unifyfs* /var/tmp/*unifyfs* \ /dev/shm/unifyfsd_id /dev/shm/svr_id /dev/shm/*na_sm* \ - "'${UNIFYFS_SPILLOVER_DATA_DIR}'"/spill*.log \ - "'${UNIFYFS_SPILLOVER_META_DIR}'"/spill*.log \ - /dev/shm/*-recv-* /dev/shm/*-req-* /dev/shm/*-super-*' + /dev/shm/logio_mem* \ + "'${UNIFYFS_LOGIO_SPILL_DIR}'"/spill*.log \ + /dev/shm/*-recv-* /dev/shm/*-req-* /dev/shm/*-super-* \ + "'$UNIFYFS_CI_TEMP_DIR'"' # Reset capturing all output exec 1>&3 2>&4 diff --git a/t/ci/setup-lsf.sh b/t/ci/setup-lsf.sh index 028483fde..c636d5146 100755 --- a/t/ci/setup-lsf.sh +++ b/t/ci/setup-lsf.sh @@ -10,8 +10,8 @@ # Returns the unique hosts being used on LSF, exluding the launch host. get_lsf_hosts() { - # NOTE: There's potential that some versions of LSF may change to where - # they put the user on the first compute node rather than a launch node. + # NOTE: It's possible that some systems with LSF may place the user on the + # first compute node rather than a launch node. local l_hosts=$(uniq $LSB_DJOB_HOSTFILE | tail -n +2) echo $l_hosts } @@ -33,31 +33,31 @@ get_hostlist() nnodes=$(get_lsf_hosts | wc -w) # Define each resource set -nprocs=${CI_NPROCS:-$nnodes} -ncores=${CI_NCORES:-20} +nprocs=${UNIFYFS_CI_NPROCS:-1} +ncores=${UNIFYFS_CI_NCORES:-20} # Total resource sets and how many per host -nrs_per_node=${CI_NRS_PER_NODE:-1} -nres_sets=${CI_NRES_SETS:-$(($nnodes * $nrs_per_node))} +nrs_per_node=${UNIFYFS_CI_NRS_PER_NODE:-1} +nres_sets=${UNIFYFS_CI_NRES_SETS:-$(($nnodes * $nrs_per_node))} if [[ $ncores -gt 20 ]]; then - echo >&2 "$errmsg Number of cores-per-resource-set (\$CI_NCORES=$ncores)" \ - "needs to be <= 20." 
+ echo >&2 "$errmsg Number of cores-per-resource-set" \ + "(\$UNIFYFS_CI_NCORES=$ncores) needs to be <= 20." exit 1 fi if (($nres_sets % $nrs_per_node)); then echo >&2 "$errmsg Total number of resource sets ($nres_sets) must be" \ "divisible by resource-sets-per-node ($nrs_per_node). Set" \ - "\$CI_NRES_SETS and/or \$CI_NRS_PER_NODE accordingly." + "\$UNIFYFS_CI_NRES_SETS and/or \$UNIFYFS_CI_NRS_PER_NODE accordingly." exit 1 fi if [ $(($nrs_per_node * $ncores)) -gt 40 ]; then echo >&2 "$errmsg Number of cores-per-resource-set ($ncores) *"\ "resource-sets-per-node ($nrs_per_node) = $(($nrs_per_node*$ncores))" \ - "needs to be <= 40. Set \$CI_NCORES and/or \$CI_NRS_PER_NODE" \ - "accordingly." + "needs to be <= 40. Set \$UNIFYFS_CI_NCORES and/or" \ + "\$UNIFYFS_CI_NRS_PER_NODE accordingly." exit 1 fi diff --git a/t/ci/setup-slurm.sh b/t/ci/setup-slurm.sh index 48749ef35..d827cf7f1 100755 --- a/t/ci/setup-slurm.sh +++ b/t/ci/setup-slurm.sh @@ -20,7 +20,7 @@ get_hostlist() # Variables specific to SLURM nnodes=$SLURM_NNODES nres_sets=$SLURM_NNODES -nprocs=${CI_NPROCS:-$nnodes} +nprocs=${UNIFYFS_CI_NPROCS:-1} app_out="-o" app_err="-e" diff --git a/t/common/seg_tree_test.c b/t/common/seg_tree_test.c new file mode 100644 index 000000000..a2b19655b --- /dev/null +++ b/t/common/seg_tree_test.c @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2018, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#include +#include "seg_tree.h" +#include "t/lib/tap.h" +#include "t/lib/testutil.h" +/* + * Test our Segment Tree library + */ + +/* + * Print the seg_tree to a buffer. Returns dst so we can directly print the + * result. 
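+ *
+ * Each segment is rendered as [start-end:ptr], where ptr is the value passed
+ * as the last argument to seg_tree_add(), so a tree holding [5-10] with ptr 0
+ * and [100-150] with ptr 100 prints as "[5-10:0][100-150:100]", matching the
+ * expected strings in the assertions below. A minimal usage sketch (the
+ * buffer size is arbitrary):
+ *
+ *   char buf[255];
+ *   printf("%s\n", print_tree(buf, &seg_tree));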
+ */ +char* print_tree(char* dst, struct seg_tree* seg_tree) +{ + int ptr = 0; + struct seg_tree_node* node = NULL; + + /* In case we don't actually print anything */ + dst[0] = '\0'; + + seg_tree_rdlock(seg_tree); + while ((node = seg_tree_iter(seg_tree, node))) { + ptr += sprintf(&dst[ptr], "[%lu-%lu:%lu]", node->start, node->end, + node->ptr); + } + seg_tree_unlock(seg_tree); + return dst; +} + +int main(int argc, char** argv) +{ + struct seg_tree seg_tree; + char tmp[255]; + unsigned long max, count; + struct seg_tree_node* node; + + plan(NO_PLAN); + + seg_tree_init(&seg_tree); + + /* Initial insert */ + seg_tree_add(&seg_tree, 5, 10, 0); + is("[5-10:0]", print_tree(tmp, &seg_tree), "Initial insert works"); + + /* Non-overlapping insert */ + seg_tree_add(&seg_tree, 100, 150, 100); + is("[5-10:0][100-150:100]", print_tree(tmp, &seg_tree), + "Non-overlapping works"); + + /* Add range overlapping part of the left size */ + seg_tree_add(&seg_tree, 2, 7, 200); + is("[2-7:200][8-10:3][100-150:100]", print_tree(tmp, &seg_tree), + "Left size overlap works"); + + /* Add range overlapping part of the right size */ + seg_tree_add(&seg_tree, 9, 12, 300); + is("[2-7:200][8-8:3][9-12:300][100-150:100]", print_tree(tmp, &seg_tree), + "Right size overlap works"); + + /* Add range totally within another range */ + seg_tree_add(&seg_tree, 3, 4, 400); + is("[2-2:200][3-4:400][5-7:203][8-8:3][9-12:300][100-150:100]", + print_tree(tmp, &seg_tree), "Inside range works"); + + /* Test counts */ + max = seg_tree_max(&seg_tree); + count = seg_tree_count(&seg_tree); + ok(max == 150, "max is 150 (got %lu)", max); + ok(count == 6, "count is 6 (got %lu)", count); + + /* Add a range that blows away multiple ranges, and overlaps */ + seg_tree_add(&seg_tree, 4, 120, 500); + is("[2-2:200][3-3:400][4-120:500][121-150:121]", print_tree(tmp, &seg_tree), + "Blow away multiple ranges works"); + + /* Test counts */ + max = seg_tree_max(&seg_tree); + count = seg_tree_count(&seg_tree); + ok(max == 150, "max is 150 (got %lu)", max); + ok(count == 4, "count is 4 (got %lu)", count); + + seg_tree_clear(&seg_tree); + is("", print_tree(tmp, &seg_tree), "seg_tree_clear() works"); + + max = seg_tree_max(&seg_tree); + count = seg_tree_count(&seg_tree); + ok(max == 0, "max 0 (got %lu)", max); + ok(count == 0, "count is 0 (got %lu)", count); + + /* + * Now let's write a long extent, and then sawtooth over it with 1 byte + * extents. + */ + seg_tree_add(&seg_tree, 0, 50, 50); + seg_tree_add(&seg_tree, 0, 0, 0); + seg_tree_add(&seg_tree, 2, 2, 2); + seg_tree_add(&seg_tree, 4, 4, 4); + seg_tree_add(&seg_tree, 6, 6, 6); + is("[0-0:0][1-1:51][2-2:2][3-3:53][4-4:4][5-5:55][6-6:6][7-50:57]", + print_tree(tmp, &seg_tree), "Sawtooth extents works"); + + max = seg_tree_max(&seg_tree); + count = seg_tree_count(&seg_tree); + ok(max == 50, "max 50 (got %lu)", max); + ok(count == 8, "count is 8 (got %lu)", count); + + /* + * Test seg_tree_find(). Find between a range that multiple segments. It + * should return the first one. + */ + node = seg_tree_find(&seg_tree, 2, 7); + ok(node->start == 2 && node->end == 2, "seg_tree_find found correct node"); + + /* Test finding a segment that partially overlaps our range */ + seg_tree_add(&seg_tree, 100, 200, 100); + node = seg_tree_find(&seg_tree, 90, 120); + ok(node->start == 100 && node->end == 200, + "seg_tree_find found partial overlapping node"); + + /* Look for a range that doesn't exist. Should return NULL. 
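+     * Callers are expected to check for NULL before dereferencing the
+     * result; a minimal sketch using only the API exercised in this file:
+     *
+     *   struct seg_tree_node* n = seg_tree_find(&seg_tree, 2000, 3000);
+     *   if (n != NULL) {
+     *       printf("overlap at [%lu-%lu]\n", n->start, n->end);
+     *   }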
*/ + node = seg_tree_find(&seg_tree, 2000, 3000); + ok(node == NULL, "seg_tree_find correctly returned NULL"); + + /* + * Write a range, then completely overwrite it with the + * same range. Use a different buf value to verify it changed. + */ + seg_tree_clear(&seg_tree); + seg_tree_add(&seg_tree, 20, 30, 0); + is("[20-30:0]", print_tree(tmp, &seg_tree), "Initial [20-30] write works"); + + seg_tree_add(&seg_tree, 20, 30, 8); + is("[20-30:8]", print_tree(tmp, &seg_tree), "Same range overwrite works"); + + /* Test coalescing */ + seg_tree_clear(&seg_tree); + seg_tree_add(&seg_tree, 5, 10, 105); + is("[5-10:105]", print_tree(tmp, &seg_tree), "Initial insert works"); + + /* Non-overlapping insert */ + seg_tree_add(&seg_tree, 100, 150, 200); + is("[5-10:105][100-150:200]", print_tree(tmp, &seg_tree), + "Non-overlapping works"); + + /* + * Add range overlapping part of the left size. + * Check that it coalesces + */ + seg_tree_add(&seg_tree, 2, 7, 102); + is("[2-10:102][100-150:200]", print_tree(tmp, &seg_tree), + "Left size overlap works"); + + /* + * Add range overlapping part of the right size. + * Check that is coalesces. + */ + seg_tree_add(&seg_tree, 9, 12, 109); + is("[2-12:102][100-150:200]", print_tree(tmp, &seg_tree), + "Right size overlap works"); + + /* + * Add range totally within another range. + * Check that it is consumed. + */ + seg_tree_add(&seg_tree, 3, 4, 103); + is("[2-12:102][100-150:200]", + print_tree(tmp, &seg_tree), "Inside range works"); + + /* Test counts */ + max = seg_tree_max(&seg_tree); + count = seg_tree_count(&seg_tree); + ok(max == 150, "max is 150 (got %lu)", max); + ok(count == 2, "count is 2 (got %lu)", count); + + /* Add a range that connects two other ranges. */ + seg_tree_add(&seg_tree, 4, 120, 104); + is("[2-150:102]", print_tree(tmp, &seg_tree), + "Connect two ranges works"); + + /* Test counts */ + max = seg_tree_max(&seg_tree); + count = seg_tree_count(&seg_tree); + ok(max == 150, "max is 150 (got %lu)", max); + ok(count == 1, "count is 1 (got %lu)", count); + + seg_tree_clear(&seg_tree); + seg_tree_add(&seg_tree, 0, 0, 0); + seg_tree_add(&seg_tree, 1, 10, 101); + seg_tree_add(&seg_tree, 20, 30, 20); + seg_tree_add(&seg_tree, 31, 40, 131); + + /* Remove a single entry */ + seg_tree_remove(&seg_tree, 0, 0); + ok(1 == 1, "removed a single entry, got %s", print_tree(tmp, &seg_tree)); + is("[1-10:101][20-30:20][31-40:131]", print_tree(tmp, &seg_tree), + "removed a single range, got %s", print_tree(tmp, &seg_tree)); + + /* Remove a range spanning the two bordering ranges [20-30] & [31-40]. 
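+     * In bracket notation, the expected effect of removing [25-31] from the
+     * tree built above is:
+     *
+     *   before: [1-10:101][20-30:20][31-40:131]
+     *   after : [1-10:101][20-24:20][32-40:132]
+     *
+     * i.e. both neighbors are truncated rather than dropped, and the ptr of
+     * the right-hand survivor advances by the number of leading bytes trimmed
+     * from it.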
*/ + seg_tree_remove(&seg_tree, 25, 31); + is("[1-10:101][20-24:20][32-40:132]", print_tree(tmp, &seg_tree), + "removed a range that truncated two entries, got %s", + print_tree(tmp, &seg_tree)); + + seg_tree_clear(&seg_tree); + seg_tree_destroy(&seg_tree); + + done_testing(); + + return 0; +} diff --git a/t/common/slotmap_test.c b/t/common/slotmap_test.c new file mode 100644 index 000000000..fe4a26b53 --- /dev/null +++ b/t/common/slotmap_test.c @@ -0,0 +1,125 @@ +#include "slotmap.h" + +#include +#include +#include + +#include "t/lib/tap.h" +#include "t/lib/testutil.h" + +struct reservation { + size_t slot; + size_t count; +}; + +int main(int argc, char** argv) +{ + int rc; + + /* process test args */ + size_t num_slots = 4096; + if (argc > 1) { + num_slots = (size_t) atoi(argv[1]); + } + + size_t num_inserts = 100; + if (argc > 2) { + num_inserts = (size_t) atoi(argv[2]); + } + + int page_multiple = 1; + if (argc > 3) { + page_multiple = atoi(argv[3]); + } + + unsigned int rand_seed = 12345678; + if (argc > 4) { + rand_seed = (unsigned int) atoi(argv[4]); + } + srand(rand_seed); + + plan(NO_PLAN); + + /* allocate an array of reservations to remove */ + size_t num_removes = num_inserts / 2; + struct reservation* to_remove = (struct reservation*) + calloc(num_removes, sizeof(struct reservation)); + if (NULL == to_remove) { + BAIL_OUT("calloc() for reservation array failed!"); + } + + /* allocate buffer to hold a slot map */ + size_t page_sz = sysconf(_SC_PAGESIZE); + printf("# NOTE: page size is %zu bytes\n", page_sz); + size_t buf_sz = page_sz * page_multiple; + void* buf = malloc(buf_sz); + if (NULL == buf) { + BAIL_OUT("ERROR: malloc(%zu) for slot map buffer failed!\n", buf_sz); + } + + slot_map* smap = slotmap_init(num_slots, buf, buf_sz); + ok(NULL != smap, "create slot map with %zu slots", num_slots); + if (NULL == smap) { + done_testing(); // will exit program + } + + size_t remove_ndx = 0; + size_t success_count = 0; + for (size_t i = 0; i < num_inserts; i++) { + size_t cnt = (size_t)rand() % 18; + if (0 == cnt) { + cnt++; + } + + ssize_t slot = slotmap_reserve(smap, cnt); + if (-1 != slot) { + success_count++; + printf("# - reserved %2zu slots (start = %zu)\n", + cnt, (size_t)slot); + + /* pick some successful reservations to test removal */ + if ((cnt > 4) && (remove_ndx < num_removes)) { + struct reservation* rsvp = to_remove + remove_ndx; + rsvp->slot = (size_t)slot; + rsvp->count = cnt; + remove_ndx++; + } + } else { + printf("# FAILED to reserve %2zu slots\n", cnt); + } + } + ok(success_count == num_inserts, + "all slotmap_reserve() calls succeeded (%zu of %zu), %zu slots used", + success_count, num_inserts, smap->used_slots); + + slotmap_print(smap); + + success_count = 0; + size_t release_count = 0; + for (size_t i = 0; i < num_removes; i++) { + struct reservation* rsvp = to_remove + i; + if (rsvp->count > 0) { + release_count++; + rc = slotmap_release(smap, rsvp->slot, rsvp->count); + if (0 == rc) { + success_count++; + printf("# - released %2zu slots (start = %zu)\n", + rsvp->count, rsvp->slot); + } else { + printf("# FAILED to release %2zu slots (start = %zu)\n", + rsvp->count, rsvp->slot); + } + } + } + ok(success_count == release_count, + "all slotmap_release() calls succeeded (%zu of %zu)", + success_count, release_count); + + slotmap_print(smap); + + rc = slotmap_clear(smap); + ok(rc == 0, "clear the slotmap"); + + done_testing(); +} + diff --git a/t/lib/Makefile.am b/t/lib/Makefile.am index f3ddbc7fc..aab557e23 100644 --- a/t/lib/Makefile.am +++ b/t/lib/Makefile.am 
@@ -1,4 +1,4 @@ -AM_CFLAGS = -Wall +AM_CFLAGS = -Wall -Werror AM_CPPFLAGS = diff --git a/t/lib/testutil.c b/t/lib/testutil.c index 75ec78a2a..26373ace6 100644 --- a/t/lib/testutil.c +++ b/t/lib/testutil.c @@ -12,10 +12,12 @@ * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. */ +#include #include #include +#include #include -#include "testutil.h" +#include static unsigned long seed; @@ -28,7 +30,7 @@ static unsigned long seed; * be run individually if need be. If they run too fast, seeding srand() with * time(NULL) can happen more than once in a second, causing the pseudo random * sequence to repeat which causes each suite to create the same random files. - * Using gettimeofday() allows us to increase the granularty to microseconds. + * Using gettimeofday() allows us to increase the granularity to microseconds. */ static void test_util_srand(void) { @@ -61,7 +63,6 @@ void testutil_rand_string(char* buf, size_t len) idx = rand() % (sizeof(charset) - 1); buf[i] = charset[idx]; } - buf[i] = '\0'; } @@ -74,20 +75,21 @@ void testutil_rand_path(char* buf, size_t len, const char* pfx) { int rc; + memset(buf, 0, len); rc = snprintf(buf, len, "%s/", pfx); testutil_rand_string(buf + rc, len - rc); } /* * Return a pointer to the path name of the UnifyFS mount point. Use the - * value of the environment variable UNIFYFS_MOUNT_POINT if it exists, + * value of the environment variable UNIFYFS_MOUNTPOINT if it exists, * otherwise use P_tmpdir which is defined in stdio.h and is typically * /tmp. */ char* testutil_get_mount_point(void) { char* path; - char* env = getenv("UNIFYFS_MOUNT_POINT"); + char* env = getenv("UNIFYFS_MOUNTPOINT"); if (env != NULL) { path = env; @@ -97,3 +99,20 @@ char* testutil_get_mount_point(void) return path; } + +/* Stat the file at the given path and store its global size in the + * location referenced by the global pointer. */ +void testutil_get_size(char* path, size_t* global) +{ + struct stat sb = {0}; + int rc; + + rc = stat(path, &sb); + if (rc != 0) { + printf("Error: %s\n", strerror(errno)); + exit(1); + } + if (global) { + *global = sb.st_size; + } +} diff --git a/t/lib/testutil.h b/t/lib/testutil.h index 001885914..58b628df6 100644 --- a/t/lib/testutil.h +++ b/t/lib/testutil.h @@ -31,3 +31,7 @@ void testutil_rand_path(char* buf, size_t len, const char* pfx); * /tmp. */ char* testutil_get_mount_point(void); + +/* Stat the file at the given path and store its global size in the + * location referenced by the global pointer. 
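+ *
+ * A typical call from a test looks like the following sketch (the expected
+ * size of 12 is illustrative only):
+ *
+ *   size_t global = 0;
+ *   testutil_get_size(path, &global);
+ *   ok(global == 12, "global size is %zu", global);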
*/ +void testutil_get_size(char* path, size_t* global); diff --git a/t/server/metadata_suite.c b/t/server/metadata_suite.c deleted file mode 100644 index 88d7c31c3..000000000 --- a/t/server/metadata_suite.c +++ /dev/null @@ -1,80 +0,0 @@ -#include - -#include "unifyfs_configurator.h" -#include "unifyfs_metadata.h" -#include "unifyfs_log.h" -#include "unifyfs_runstate.h" - -#include "t/lib/tap.h" - -#include "metadata_suite.h" - -int main(int argc, char* argv[]) -{ - /* need to initialize enougth of the server to use the metadata API */ - unifyfs_cfg_t server_cfg; - int rc, provided, glb_rank, glb_size; - - /* get the configuration */ - rc = unifyfs_config_init(&server_cfg, argc, argv); - if (rc != 0) { - exit(1); - } - - rc = unifyfs_write_runstate(&server_cfg); - if (rc != (int)UNIFYFS_SUCCESS) { - exit(1); - } - - rc = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided); - - if (rc != MPI_SUCCESS) { - exit(1); - } - - rc = MPI_Comm_rank(MPI_COMM_WORLD, &glb_rank); - if (rc != MPI_SUCCESS) { - exit(1); - } - - rc = MPI_Comm_size(MPI_COMM_WORLD, &glb_size); - if (rc != MPI_SUCCESS) { - exit(1); - } - - rc = meta_init_store(&server_cfg); - if (rc != 0) { - LOG(LOG_ERR, "%s", - unifyfs_error_enum_description(UNIFYFS_ERROR_MDINIT)); - exit(1); - } - - /* - * necessary infrastructure is initialized - * running tests - */ - - plan(NO_PLAN); - - // keep the order - - unifyfs_set_file_attribute_test(); - unifyfs_get_file_attribute_test(); - - - /* - * shut down infrastructure - */ - - // shutdown the metadata service - meta_sanitize(); - - // finalize mpi - MPI_Finalize(); - - // finish the testing - // needs to be last call - done_testing(); - - return EXIT_SUCCESS; -} diff --git a/t/server/metadata_suite.h b/t/server/metadata_suite.h deleted file mode 100644 index 3afcce2c2..000000000 --- a/t/server/metadata_suite.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2019, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * - * Copyright 2019, UT-Battelle, LLC. - * - * LLNL-CODE-741539 - * All rights reserved. - * - * This is the license for UnifyFS. - * For details, see https://github.com/LLNL/UnifyFS. - * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. - */ - - -/* This is the collection of metadata tests to be run inside of - * metadata_suite.c. These tests are testing the wrapper functions found in - * server/src/unifyfs_metadata.c. 
- */ - - -#ifndef METADATA_SUITE_H -#define METADATA_SUITE_H - -int unifyfs_set_file_attribute_test(void); -int unifyfs_get_file_attribute_test(void); -int unifyfs_get_file_extents_test(void); - -#endif /* METADATA_SUITE_H */ diff --git a/t/server/unifyfs_meta_get_test.c b/t/server/unifyfs_meta_get_test.c deleted file mode 100644 index db2c304e6..000000000 --- a/t/server/unifyfs_meta_get_test.c +++ /dev/null @@ -1,60 +0,0 @@ -#include - -#include "metadata_suite.h" -#include "unifyfs_meta.h" -#include "unifyfs_metadata.h" -#include "t/lib/tap.h" - -#define TEST_META_GFID_VALUE 0xbeef -#define TEST_META_FID_VALUE 0xfeed -#define TEST_META_FILE "/unifyfs/filename/to/nowhere" - -int unifyfs_set_file_attribute_test(void) -{ - int rc; - - /* create dummy file attribute */ - unifyfs_file_attr_t fattr = {0}; - - fattr.gfid = TEST_META_GFID_VALUE; - fattr.fid = TEST_META_FID_VALUE; - snprintf(fattr.filename, sizeof(fattr.filename), TEST_META_FILE); - fflush(NULL); - - rc = unifyfs_set_file_attribute(&fattr); - ok(UNIFYFS_SUCCESS == rc, "Stored file attribute"); - fflush(NULL); - return 0; -} - -int unifyfs_get_file_attribute_test(void) -{ - int rc; - unifyfs_file_attr_t fattr; - - rc = unifyfs_get_file_attribute(TEST_META_GFID_VALUE, &fattr); - ok(UNIFYFS_SUCCESS == rc && - TEST_META_GFID_VALUE == fattr.gfid && - TEST_META_FID_VALUE == fattr.fid && - (0 == strcmp(fattr.filename, TEST_META_FILE)), - "Retrieve file attributes (rc = %d, gfid = 0x%02X, fid = 0x%02X)", - rc, fattr.gfid, fattr.fid - ); - return 0; -} - -// this test is not run right now -int unifyfs_get_file_extents_test(void) -{ - int rc, num_values, num_keys; - int key_lens[16]; - unifyfs_key_t keys[16]; - unifyfs_keyval_t keyval[16]; - - rc = unifyfs_get_file_extents(num_keys, &keys, key_lens, - &num_values, &keyval); - ok(UNIFYFS_SUCCESS == rc, - "Retrieved file extents (rc = %d)", rc - ); - return 0; -} diff --git a/t/sharness.d/00-test-env.sh b/t/sharness.d/00-test-env.sh index 5ab26fa28..6efd504a0 100644 --- a/t/sharness.d/00-test-env.sh +++ b/t/sharness.d/00-test-env.sh @@ -27,15 +27,14 @@ if test -n "$(which jsrun 2>/dev/null)"; then elif test -n "$(which srun 2>/dev/null)"; then JOB_RUN_COMMAND="srun -n1 -N1" elif test -n "$(which mpirun 2>/dev/null)"; then - JOB_RUN_COMMAND="mpirun -wd $UNIFYFS_BUILD_DIR -np 1" + JOB_RUN_COMMAND="mpirun -np 1" fi - if test -z "$JOB_RUN_COMMAND"; then echo >&2 "Failed to find a suitable parallel job launcher" echo >&2 "Do you need to install OpenMPI or SLURM?" return 1 fi - +#echo >&2 "Using JOB_RUN_COMMAND: $JOB_RUN_COMMAND" export JOB_RUN_COMMAND # diff --git a/t/sharness.d/01-unifyfs-settings.sh b/t/sharness.d/01-unifyfs-settings.sh index a3155f3cc..a44d3fc1d 100644 --- a/t/sharness.d/01-unifyfs-settings.sh +++ b/t/sharness.d/01-unifyfs-settings.sh @@ -5,33 +5,19 @@ # # Source a script that is dynamically generated by 0001-setup.t. # -. 
$UNIFYFS_TEST_RUN_SCRIPT - +source $UNIFYFS_TEST_RUN_SCRIPT # Common settings -UNIFYFS_MOUNTPOINT=${UNIFYFS_MOUNT_POINT:-$(mktemp -d)} -export UNIFYFS_MOUNTPOINT +export UNIFYFS_LOG_VERBOSITY=${UNIFYFS_LOG_VERBOSITY:-5} +export UNIFYFS_MOUNTPOINT=${UNIFYFS_TEST_MOUNT:-"/unifyfs"} # Server settings -UNIFYFS_META_DB_PATH=${UNIFYFS_META_DB_PATH:-$(mktemp -d)} -UNIFYFS_META_DB_NAME=${UNIFYFS_META_DB_NAME:-unifyfs_db} -UNIFYFS_META_SERVER_RATIO=${UNIFYFS_META_SERVER_RATIO:-1} -UNIFYFS_LOG_DIR=${UNIFYFS_LOG_DIRECTORY:-$UNIFYFS_META_DB_PATH} -UNIFYFS_LOG_FILE=${UNIFYFS_LOG_FILE:-unifyfsd_debuglog} -UNIFYFS_RUNSTATE_DIR=${UNIFYFS_RUNSTATE_DIR:-$UNIFYFS_META_DB_PATH} -UNIFYFS_SHAREDFS_DIR=${UNIFYFS_SHAREDFS_DIR:-$UNIFYFS_META_DB_PATH} -export UNIFYFS_LOG_DIR -export UNIFYFS_LOG_FILE -export UNIFYFS_META_DB_NAME -export UNIFYFS_META_DB_PATH -export UNIFYFS_META_SERVER_RATIO -export UNIFYFS_RUNSTATE_DIR -export UNIFYFS_SHAREDFS_DIR +export UNIFYFS_LOG_DIR=${UNIFYFS_LOG_DIR:-$UNIFYFS_TEST_STATE} +export UNIFYFS_LOG_FILE="unifyfsd.log" +export UNIFYFS_META_DB_PATH=${UNIFYFS_TEST_META} +export UNIFYFS_RUNSTATE_DIR=${UNIFYFS_TEST_STATE} +export UNIFYFS_SHAREDFS_DIR=${UNIFYFS_TEST_SHARE} # Client settings -UNIFYFS_SPILLOVER_ENABLED=${UNIFYFS_SPILLOVER_ENABLED:-"Y"} -UNIFYFS_SPILLOVER_DATA_DIR=${UNIFYFS_SPILLOVER_DATA_DIR:-$UNIFYFS_META_DB_PATH} -UNIFYFS_SPILLOVER_META_DIR=${UNIFYFS_SPILLOVER_META_DIR:-$UNIFYFS_META_DB_PATH} -export UNIFYFS_SPILLOVER_DATA_DIR -export UNIFYFS_SPILLOVER_META_DIR -export UNIFYFS_SPILLOVER_ENABLED +export UNIFYFS_LOGIO_SPILL_SIZE=$((5 * (2 ** 30))) +export UNIFYFS_LOGIO_SPILL_DIR=${UNIFYFS_TEST_SPILL} diff --git a/t/sharness.d/02-functions.sh b/t/sharness.d/02-functions.sh old mode 100755 new mode 100644 index e4bb0b256..4aa91bc20 --- a/t/sharness.d/02-functions.sh +++ b/t/sharness.d/02-functions.sh @@ -61,31 +61,85 @@ process_is_not_running() return 1 } +# Dump test state (for debugging) +unifyfsd_dump_state() +{ + if ! test -d "$UNIFYFS_TEST_TMPDIR"; then + return 1 + fi + + dumpfile=$UNIFYFS_TEST_TMPDIR/unifyfsd.dump.$$ + [ -f $dumpfile ] || touch $dumpfile + + metadir=$UNIFYFS_TEST_META + if [ -d $metadir ]; then + echo "Listing meta directory $metadir :" >> $dumpfile + ls -lR $metadir >> $dumpfile + echo >> $dumpfile + fi + + sharedir=$UNIFYFS_TEST_SHARE + if [ -d $sharedir ]; then + echo "Listing share directory $sharedir :" >> $dumpfile + ls -lR $sharedir >> $dumpfile + echo >> $dumpfile + fi + + spilldir=$UNIFYFS_TEST_SPILL + if [ -d $spilldir ]; then + echo "Listing spill directory $spilldir :" >> $dumpfile + ls -lR $spilldir >> $dumpfile + echo >> $dumpfile + fi + + statedir=$UNIFYFS_TEST_STATE + if [ -d $statedir ]; then + echo "Listing state directory $statedir :" >> $dumpfile + ls -lR $statedir >> $dumpfile + echo >> $dumpfile + echo "Dumping state directory $statedir file contents :" >> $dumpfile + for f in $statedir/* ; do + if [ -f $f ]; then + echo "========= $f ==========" >> $dumpfile + cat $f >> $dumpfile + echo "+++++++++++++++++++++++" >> $dumpfile + echo >> $dumpfile + fi + done + fi + # print out dumpfile contents to current test log + cat $dumpfile >&3 + return 0 +} + +# Remove the test directory. +unifyfsd_cleanup() +{ + unifyfsd_dump_state + # remove test directory if it exists + test -d "$UNIFYFS_TEST_TMPDIR" && /bin/rm -r $UNIFYFS_TEST_TMPDIR + return 0 +} + # Create metadata directory if needed and start daemon. unifyfsd_start_daemon() { - # Make sure metadata directory exists - if test -z "$UNIFYFS_META_DB_PATH"; then - return 1 - elif ! 
test -d "$UNIFYFS_META_DB_PATH" && - ! mkdir $UNIFYFS_META_DB_PATH; then + # Make sure test directory exists + if ! test -d "$UNIFYFS_TEST_TMPDIR"; then return 1 fi # Generate servers hostfile - # if test -z "$UNIFYFS_SHAREDFS_DIR"; then - # return 1 - # elif ! test -d "$UNIFYFS_SHAREDFS_DIR" && - # ! mkdir $UNIFYFS_SHAREDFS_DIR; then - # return 1 - # fi - # srvr_hosts=$UNIFYFS_SHAREDFS_DIR/unifyfsd.hosts - # if [ ! -f $srvr_hosts ]; then - # touch $srvr_hosts - # echo "1" >> $srvr_hosts - # hostname >> $srvr_hosts - # fi - # export UNIFYFS_SERVER_HOSTFILE=$srvr_hosts + if test -z "$UNIFYFS_SHAREDFS_DIR"; then + return 1 + fi + srvr_hosts=$UNIFYFS_SHAREDFS_DIR/unifyfsd.hosts + if [ ! -f $srvr_hosts ]; then + touch $srvr_hosts + echo "1" >> $srvr_hosts + hostname >> $srvr_hosts + fi + export UNIFYFS_SERVER_HOSTFILE=$srvr_hosts # run server daemon $UNIFYFSD @@ -94,12 +148,14 @@ unifyfsd_start_daemon() # Kill UnifyFS daemon. unifyfsd_stop_daemon() { - while killall -q -s TERM unifyfsd 2>/dev/null; do :; done + killsig="TERM" + srvrpids="$(pgrep unifyfsd)" + while [ -n "$srvrpids" ]; do + killall -q -s $killsig unifyfsd 2>/dev/null + sleep 5 + srvrpids="$(pgrep unifyfsd)" + killsig="KILL" + done } -# Remove the metadata directory. -unifyfsd_cleanup() -{ - test -d "$UNIFYFS_META_DB_PATH" && rm -rf $UNIFYFS_META_DB_PATH - # test -d "$UNIFYFS_SHAREDFS_DIR" && rm -rf $UNIFYFS_SHAREDFS_DIR -} + diff --git a/t/std/fflush.c b/t/std/fflush.c new file mode 100644 index 000000000..1daa146b7 --- /dev/null +++ b/t/std/fflush.c @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2019, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2018, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + + /* + * Test fflush(). Currently this test is skipped until #374 is addressed. 
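+  * If the skip is reinstated, the TAP pattern used by fseek-ftell.c elsewhere
+  * in this patch is the likely shape; the test count here is illustrative
+  * only:
+  *
+  *   skip(1, 9, "skipping fflush() test until #374 is addressed");
+  *   ...
+  *   end_skip;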
+ */ +#include +#include +#include +#include +#include +#include +#include "t/lib/tap.h" +#include "t/lib/testutil.h" + +int fflush_test(char* unifyfs_root) +{ + char path[64]; + char buf[64] = {0}; + FILE* fp = NULL; + int rc; + + /* Generate a random file name in the mountpoint path to test on */ + testutil_rand_path(path, sizeof(path), unifyfs_root); + + /* Write "hello world" to a file */ + fp = fopen(path, "w"); + ok(fp != NULL, "%s: fopen(%s): %s", __FILE__, path, strerror(errno)); + + rc = fwrite("hello world", 12, 1, fp); + ok(rc == 1, "%s: fwrite(\"hello world\"): %s", __FILE__, strerror(errno)); + + /* Flush the extents */ + rc = fflush(fp); + ok(rc == 0, "%s: fflush() (rc=%d): %s", __FILE__, rc, strerror(errno)); + + rc = fclose(fp); + ok(rc == 0, "%s: fclose() (rc=%d): %s", __FILE__, rc, strerror(errno)); + + /* Laminate */ + rc = chmod(path, 0444); + ok(rc == 0, "%s: chmod(0444) (rc=%d): %s", __FILE__, strerror(errno)); + + /* Read it back */ + fp = fopen(path, "r"); + ok(fp != NULL, "%s: fopen(%s): %s", __FILE__, path, strerror(errno)); + + rc = fread(buf, 12, 1, fp); + ok(rc == 1, "%s: fread() buf[]=\"%s\", (rc %d): %s", __FILE__, buf, rc, + strerror(errno)); + is(buf, "hello world", "%s: saw \"hello world\"", __FILE__); + + rc = fclose(fp); + ok(rc == 0, "%s: fclose() (rc=%d): %s", __FILE__, rc, strerror(errno)); + + //end_skip; + + return 0; +} diff --git a/t/std/fopen-fclose.c b/t/std/fopen-fclose.c index 3e1995714..9f9d9d850 100644 --- a/t/std/fopen-fclose.c +++ b/t/std/fopen-fclose.c @@ -34,51 +34,76 @@ int fopen_fclose_test(char* unifyfs_root) char path[64]; char path2[64]; FILE* fd = NULL; - int rc; + int err, rc; + /* Generate a random file name in the mountpoint path to test on */ testutil_rand_path(path, sizeof(path), unifyfs_root); testutil_rand_path(path2, sizeof(path2), unifyfs_root); + /* Verify fopen a non-existent file as read-only fails with errno=ENOENT. */ + errno = 0; + fd = fopen(path, "r"); + err = errno; + ok(fd == NULL && err == ENOENT, + "%s:%d fopen non-existent file %s w/ mode r: %s", + __FILE__, __LINE__, path, strerror(err)); + /* Verify we can create a new file. */ errno = 0; fd = fopen(path, "w"); - ok(fd != NULL, "fopen non-existing file %s mode w: %s", - path, strerror(errno)); + err = errno; + ok(fd != NULL && err == 0, + "%s:%d fopen non-existing file %s w/ mode w: %s", + __FILE__, __LINE__, path, strerror(err)); /* Verify close succeeds. */ errno = 0; rc = fclose(fd); - ok(rc == 0, "fclose new file %s (rc=%d): %s", path, rc, strerror(errno)); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d fclose new file: %s", + __FILE__, __LINE__, strerror(err)); /* Verify we can create a new file with mode "a". */ errno = 0; fd = fopen(path2, "a"); - ok(fd != NULL, "fopen non-existing file %s mode a: %s", - path2, strerror(errno)); + err = errno; + ok(fd != NULL && err == 0, + "%s:%d fopen non-existing file %s mode a: %s", + __FILE__, __LINE__, path2, strerror(err)); /* Verify close succeeds. */ errno = 0; rc = fclose(fd); - ok(rc == 0, "fclose new file %s (rc=%d): %s", path, rc, strerror(errno)); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d fclose new file: %s", + __FILE__, __LINE__, strerror(err)); /* Verify opening an existing file with mode "r" succeeds. */ errno = 0; fd = fopen(path, "r"); - ok(fd != NULL, "fopen existing file %s mode r: %s", - path, strerror(errno)); + err = errno; + ok(fd != NULL && err == 0, + "%s:%d fopen existing file %s mode r: %s", + __FILE__, __LINE__, path, strerror(err)); /* Verify close succeeds. 
*/ errno = 0; rc = fclose(fd); - ok(rc == 0, "fclose %s (rc=%d): %s", path, rc, strerror(errno)); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d fclose worked: %s", + __FILE__, __LINE__, strerror(err)); /* Verify closing already closed file fails with errno=EBADF */ errno = 0; rc = fclose(fd); - ok(rc < 0 && errno == EBADF, - "fclose already closed file %s should fail (rc=%d, errno=%d): %s", - path, rc, errno, strerror(errno)); + err = errno; + ok(rc == -1 && err == EBADF, + "%s:%d fclose already closed file %s should fail (errno=%d): %s", + __FILE__, __LINE__, path, err, strerror(err)); diag("Finished UNIFYFS_WRAP(fopen/fclose) tests"); diff --git a/t/std/fseek-ftell.c b/t/std/fseek-ftell.c new file mode 100644 index 000000000..537dd46c3 --- /dev/null +++ b/t/std/fseek-ftell.c @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2018, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2018, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#include +#include +#include +#include + +#include "t/lib/tap.h" +#include "t/lib/testutil.h" + +/* This function contains the tests for UNIFYFS_WRAP(fseek), + * UNIFYFS_WRAP(ftell), and UNIFYFS_WRAP(rewind) found in + * client/src/unifyfs-stdio.c. + * + * Notice the tests are ordered in a logical testing order. Changing the order + * or adding new tests in between two others could negatively affect the + * desired results. */ +int fseek_ftell_test(char* unifyfs_root) +{ + /* Diagnostic message for reading and debugging output */ + diag("Starting UNIFYFS_WRAP(fseek/ftell/rewind) tests"); + + char path[64]; + FILE* fp = NULL; + int err, rc; + + /* Create a random file at the mountpoint path to test on */ + testutil_rand_path(path, sizeof(path), unifyfs_root); + + skip(1, 3, "causing a hang on some architectures. Try after future update"); + /* fseek on bad file stream should fail with errno=EBADF */ + dies_ok({ fseek(fp, 0, SEEK_SET); }, + "%s:%d fseek on bad file stream segfaults", + __FILE__, __LINE__); + + /* ftell on non-open file stream should fail with errno=EBADF + * variable declaration and `ok` test are to avoid a compiler warning */ + dies_ok({ int rc = ftell(fp); ok(rc > 0); }, + "%s:%d ftell on bad file stream segfaults", + __FILE__, __LINE__); + + /* rewind on non-open file stream should fail with errno=EBADF */ + dies_ok({ rewind(fp); }, "%s:%d rewind on bad file stream segfaults", + __FILE__, __LINE__); + end_skip; + + /* Open a file and write to it to test fseek() */ + errno = 0; + fp = fopen(path, "w"); + err = errno; + ok(fp != NULL && err == 0, "%s:%d fopen(%s): %s", + __FILE__, __LINE__, path, strerror(err)); + + errno = 0; + rc = (int) fwrite("hello world", 12, 1, fp); + err = errno; + ok(rc == 1 && err == 0, + "%s:%d fwrite() to file %s: %s", + __FILE__, __LINE__, path, strerror(err)); + + /* fseek with invalid whence fails with errno=EINVAL. 
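+     * Only SEEK_SET, SEEK_CUR, and SEEK_END are valid whence values, so the
+     * -1 used below is deliberately bogus. For comparison, a well-formed call
+     * checks the return value rather than errno alone:
+     *
+     *   if (fseek(fp, 0, SEEK_SET) != 0) {
+     *       perror("fseek");
+     *   }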
*/ + errno = 0; + rc = (int) fseek(fp, 0, -1); + err = errno; + ok(rc == -1 && err == EINVAL, + "%s:%d fseek with invalid whence should fail (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + /*--- fseek() with SEEK_SET tests ---*/ + + /* fseek to negative offset with SEEK_SET should fail with errno=EINVAL */ + errno = 0; + rc = (int) fseek(fp, -1, SEEK_SET); + err = errno; + ok(rc == -1 && err == EINVAL, + "%s:%d fseek(-1) to invalid offset w/ SEEK_SET fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + /* ftell after invalid fseek should return last offset */ + errno = 0; + rc = (int) ftell(fp); + err = errno; + ok(rc == 12 && err == 0, + "%s:%d ftell after fseek(-1) to invalid offset w/ SEEK_SET: %s", + __FILE__, __LINE__, strerror(err)); + + /* fseek to valid offset with SEEK_SET succeeds */ + errno = 0; + rc = (int) fseek(fp, 7, SEEK_SET); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d fseek(7) to valid offset w/ SEEK_SET: %s", + __FILE__, __LINE__, strerror(err)); + + /* ftell after valid fseek with SEEK_SET */ + errno = 0; + rc = (int) ftell(fp); + err = errno; + ok(rc == 7 && err == 0, + "%s:%d ftell after fseek(7) w/ SEEK_SET: %s", + __FILE__, __LINE__, strerror(err)); + + /* fseek beyond end of file with SEEK_SET succeeds */ + errno = 0; + rc = (int) fseek(fp, 25, SEEK_SET); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d fseek(25) past EOF w/ SEEK_SET: %s", + __FILE__, __LINE__, strerror(err)); + + /* ftell after fseek beyond end of file with SEEK_SET */ + errno = 0; + rc = (int) ftell(fp); + err = errno; + ok(rc == 25 && err == 0, + "%s:%d ftell after fseek(25) w/ SEEK_SET: %s", + __FILE__, __LINE__, strerror(err)); + + /* fseek to beginning of file with SEEK_SET succeeds */ + errno = 0; + rc = (int) fseek(fp, 0, SEEK_SET); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d fseek(0) w/ SEEK_SET: %s", + __FILE__, __LINE__, strerror(err)); + + /* ftell after fseek to beginning of file with SEEK_SET */ + errno = 0; + rc = (int) ftell(fp); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d ftell after fseek(0) w/ SEEK_SET: %s", + __FILE__, __LINE__, strerror(err)); + + /*--- fseek() with SEEK_CUR tests ---*/ + + /* fseek to end of file with SEEK_CUR succeeds */ + errno = 0; + rc = (int) fseek(fp, 12, SEEK_CUR); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d fseek(12) to EOF w/ SEEK_CUR: %s", + __FILE__, __LINE__, strerror(err)); + + /* ftell after fseek to end of file with SEEK_CUR */ + errno = 0; + rc = (int) ftell(fp); + err = errno; + ok(rc == 12 && err == 0, + "%s:%d ftell after fseek(12) w/ SEEK_CUR: %s", + __FILE__, __LINE__, strerror(err)); + + /* fseek to negative offset with SEEK_CUR should fail with errno=EINVAL */ + errno = 0; + rc = (int) fseek(fp, -15, SEEK_CUR); + err = errno; + ok(rc == -1 && err == EINVAL, + "%s:%d fseek(-15) to invalid offset w/ SEEK_CUR fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + /* ftell after fseek to negative offset with SEEK_CUR */ + errno = 0; + rc = (int) ftell(fp); + err = errno; + ok(rc == 12 && err == 0, + "%s:%d ftell after fseek(-15) to invalid offset w/ SEEK_CUR: %s", + __FILE__, __LINE__, strerror(err)); + + /* fseek to beginning of file with SEEK_CUR succeeds */ + errno = 0; + rc = (int) fseek(fp, -12, SEEK_CUR); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d fseek(-12) to beginning of file w/ SEEK_CUR: %s", + __FILE__, __LINE__, strerror(err)); + + /* ftell after fseek to beginning of file with SEEK_CUR */ + errno = 0; + rc = (int) ftell(fp); + err = errno; + ok(rc == 
0 && err == 0, + "%s:%d ftell after fseek(-12) to beginning of file w/ SEEK_CUR: %s", + __FILE__, __LINE__, strerror(err)); + + /* fseek beyond end of file with SEEK_CUR succeeds */ + errno = 0; + rc = (int) fseek(fp, 25, SEEK_CUR); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d fseek(25) past EOF w/ SEEK_CUR: %s", + __FILE__, __LINE__, strerror(err)); + + /* ftell after fseek beyond end of file with SEEK_CUR */ + errno = 0; + rc = (int) ftell(fp); + err = errno; + ok(rc == 25 && err == 0, + "%s:%d ftell after fseek(25) past EOF w/ SEEK_CUR: %s", + __FILE__, __LINE__, strerror(err)); + + /*--- rewind test ---*/ + + /* ftell after rewind reports beginning of file */ + rewind(fp); + errno = 0; + rc = (int) ftell(fp); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d ftell after rewind reports beginning of file: %s", + __FILE__, __LINE__, strerror(err)); + + /*--- fseek() with SEEK_END tests ---*/ + + /* fseek to negative offset with SEEK_END should fail with errno=EINVAL */ + errno = 0; + rc = (int) fseek(fp, -15, SEEK_END); + err = errno; + ok(rc == -1 && err == EINVAL, + "%s:%d fseek(-15) to invalid offset w/ SEEK_END fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + /* ftell after fseek to negative offset with SEEK_END */ + errno = 0; + rc = (int) ftell(fp); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d ftell after fseek(-15) to negative offset w/ SEEK_END: %s", + __FILE__, __LINE__, strerror(err)); + + /* fseek back one from end of file with SEEK_END succeeds */ + errno = 0; + rc = (int) fseek(fp, -1, SEEK_END); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d fseek(-1) from EOF w/ SEEK_END: %s", + __FILE__, __LINE__, strerror(err)); + + /* ftell after fseek back one from end of file with SEEK_END */ + errno = 0; + rc = (int) ftell(fp); + err = errno; + ok(rc == 11 && err == 0, + "%s:%d ftell after fseek(-1) from end w/ SEEK_END: %s", + __FILE__, __LINE__, strerror(err)); + + /* fseek to beginning of file with SEEK_END succeeds */ + errno = 0; + rc = (int) fseek(fp, -12, SEEK_END); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d fseek(-12) to beginning of file w/ SEEK_END: %s", + __FILE__, __LINE__, strerror(err)); + + /* ftell after fseek to beginning of file with SEEK_END */ + errno = 0; + rc = (int) ftell(fp); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d ftell after fseek(-12) to beginning of file w/ SEEK_END: %s", + __FILE__, __LINE__, strerror(err)); + + /* fseek beyond end of file with SEEK_END succeeds */ + errno = 0; + rc = (int) fseek(fp, 25, SEEK_END); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d fseek(25) past EOF w/ SEEK_END: %s", + __FILE__, __LINE__, strerror(err)); + + /* ftell after fseek beyond end of file with SEEK_END */ + errno = 0; + rc = (int) ftell(fp); + err = errno; + ok(rc == 37 && err == 0, + "%s:%d ftell after fseek(25) past EOF w/ SEEK_END: %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = fclose(fp); + err = errno; + ok(rc == 0 && err == 0, "%s:%d fclose(): %s", + __FILE__, __LINE__, strerror(err)); + + /*--- non-open file stream tests ---*/ + + /* fseek in non-open file stream should fail with errno=EBADF */ + errno = 0; + rc = (int) fseek(fp, 0, SEEK_SET); + err = errno; + ok(rc == -1 && err == EBADF, + "%s:%d fseek in non-open file stream fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + /* ftell on non-open file stream should fail with errno=EBADF */ + errno = 0; + rc = (int) ftell(fp); + err = errno; + ok(rc == -1 && err == EBADF, + "%s:%d ftell on non-open file stream fails 
(errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + /* rewind on non-open file stream should fail with errno=EBADF */ + errno = 0; + rewind(fp); + err = errno; + ok(err == EBADF, + "%s:%d rewind on non-open file stream fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + diag("Finished UNIFYFS_WRAP(fseek/ftell/rewind) tests"); + + return 0; +} diff --git a/t/std/fwrite-fread.c b/t/std/fwrite-fread.c new file mode 100644 index 000000000..4da434e73 --- /dev/null +++ b/t/std/fwrite-fread.c @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2019, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2018, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + + /* + * Test fwrite/fread/fgets/rewind/feof/chmod + */ + +#include +#include +#include +#include +#include +#include +#include "t/lib/tap.h" +#include "t/lib/testutil.h" + +int fwrite_fread_test(char* unifyfs_root) +{ + diag("Starting UNIFYFS_WRAP(fwrite/fread/fgets/feof) tests"); + + char path[64]; + char buf[64] = {0}; + char* tmp; + FILE* fp = NULL; + int fd = 0; + int err, rc; + + testutil_rand_path(path, sizeof(path), unifyfs_root); + + skip(1, 4, "causing hang on some architectures. Try after future update"); + /* fwrite to bad FILE stream should segfault */ + dies_ok({ FILE* p = fopen("/tmp/fakefile", "r"); + fwrite("hello world", 12, 1, p); }, + "%s:%d fwrite() to bad FILE stream segfaults", + __FILE__, __LINE__); + + /* fread from bad FILE stream should segfault */ + dies_ok({ size_t rc = fread(buf, 15, 1, fp); ok(rc > 0); }, + "%s:%d fread() from bad FILE stream segfaults", + __FILE__, __LINE__); + + /* fgets from bad FILE stream segfaults */ + memset(buf, 0, sizeof(buf)); + dies_ok({ char* tmp = fgets(buf, 15, fp); ok(tmp != NULL); }, + "%s:%d fgets() from bad FILE stream segfaults", + __FILE__, __LINE__); + + dies_ok({ int rc = feof(fp); ok(rc > 0); }, + "%s:%d feof() on bad FILE stream segfaults", + __FILE__, __LINE__); + end_skip; + + /* Write "hello world" to a file */ + errno = 0; + fp = fopen(path, "w"); + err = errno; + ok(fp != NULL && err == 0, "%s:%d fopen(%s): %s", + __FILE__, __LINE__, path, strerror(err)); + + errno = 0; + rc = (int) fwrite("hello world", 12, 1, fp); + err = errno; + ok(rc == 1 && err == 0, + "%s:%d fwrite(\"hello world\") to file %s: %s", + __FILE__, __LINE__, path, strerror(err)); + + /* Seek to offset and overwrite "world" with "universe" */ + errno = 0; + rc = fseek(fp, 6, SEEK_SET); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d fseek(6) to \"world\": %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = (int) fwrite("universe", 9, 1, fp); + err = errno; + ok(rc == 1 && err == 0, + "%s:%d overwrite \"world\" at offset 6 with \"universe\" : %s", + __FILE__, __LINE__, strerror(err)); + + /* fread from file open as write-only should fail with errno=EBADF */ + errno = 0; + rc = fseek(fp, 0, SEEK_SET); + err = errno; + ok(rc == 0 && err == 0, "%s:%d fseek(0): %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = (int) fread(buf, 15, 1, fp); + err = errno; + ok(rc == 0 && err == EBADF, + "%s:%d fread() from file open as write-only should fail (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + errno = 0; + rc = fclose(fp); + err = errno; + ok(rc == 0 && err == 0, "%s:%d fclose(): %s", + 
__FILE__, __LINE__, strerror(err)); + + /* fsync() tests */ + /* fsync on bad file descriptor should fail with errno=EINVAL */ + errno = 0; + rc = fsync(fd); + err = errno; + ok(rc == -1 && err == EINVAL, + "%s:%d fsync() on bad file descriptor should fail (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + /* Sync extents */ + errno = 0; + fd = open(path, O_RDWR); + err = errno; + ok(fd != -1 && err == 0, "%s:%d open() (fd=%d): %s", + __FILE__, __LINE__, fd, strerror(err)); + + errno = 0; + rc = fsync(fd); + err = errno; + ok(rc == 0 && err == 0, "%s:%d fsync(): %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = close(fd); + err = errno; + ok(rc == 0 && err == 0, "%s:%d close(): %s", + __FILE__, __LINE__, strerror(err)); + + /* fsync on non-open file should fail with errno=EBADF */ + errno = 0; + rc = fsync(fd); + err = errno; + ok(rc == -1 && err == EBADF, + "%s:%d fsync() on non-open file should fail (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + /* Laminate */ + errno = 0; + rc = chmod(path, 0444); + err = errno; + ok(rc == 0 && err == 0, "%s:%d chmod(0444): %s", + __FILE__, __LINE__, strerror(err)); + + /* fopen a laminated file for write should fail with errno=EROFS */ + errno = 0; + fp = fopen(path, "w"); + err = errno; + ok(fp == NULL && err == EROFS, + "%s:%d fopen laminated file for write fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + /* fread() tests */ + errno = 0; + fp = fopen(path, "r"); + err = errno; + ok(fp != NULL && err == 0, "%s:%d fopen(%s): %s", + __FILE__, __LINE__, path, strerror(err)); + + /* fwrite to file open as read-only should fail with errno=EBADF */ + errno = 0; + rc = (int) fwrite("hello world", 12, 1, fp); + err = errno; + ok(rc == 0 && err == EBADF, + "%s:%d fwrite() to file open for read-only fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + errno = 0; + rc = (int) fread(buf, 15, 1, fp); + err = errno; + ok(rc == 1 && err == 0, + "%s:%d fread() buf[]=\"%s\": %s", + __FILE__, __LINE__, buf, strerror(err)); + is(buf, "hello universe", "%s:%d fread() saw \"hello universe\"", + __FILE__, __LINE__); + + errno = 0; + rc = fseek(fp, 6, SEEK_SET); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d fseek(6): %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = (int) fread(buf, 9, 1, fp); + err = errno; + ok(rc == 1 && err == 0, + "%s:%d fread() at offset 6 buf[]=\"%s\": %s", + __FILE__, __LINE__, buf, strerror(err)); + is(buf, "universe", "%s:%d fread() saw \"universe\"", __FILE__, __LINE__); + + rewind(fp); + errno = 0; + rc = (int) fread(buf, 15, 1, fp); + err = errno; + ok(rc == 1 && err == 0, + "%s:%d fread() after rewind() buf[]=\"%s\": %s", + __FILE__, __LINE__, buf, strerror(err)); + is(buf, "hello universe", "%s:%d fread() saw \"hello universe\"", + __FILE__, __LINE__); + + /* fgets() tests */ + rewind(fp); + memset(buf, 0, sizeof(buf)); + errno = 0; + tmp = fgets(buf, 15, fp); + err = errno; + ok(tmp == buf && err == 0, + "%s:%d fgets() after rewind() buf[]=\"%s\": %s", + __FILE__, __LINE__, buf, strerror(err)); + is(buf, "hello universe", "%s:%d fgets() saw \"hello universe\"", + __FILE__, __LINE__); + + rewind(fp); + memset(buf, 0, sizeof(buf)); + errno = 0; + tmp = fgets(buf, 6, fp); + err = errno; + ok(tmp == buf && err == 0, + "%s:%d fgets() w/ size = 6 after rewind() buf[]=\"%s\": %s", + __FILE__, __LINE__, buf, strerror(err)); + is(buf, "hello", "%s:%d fgets() saw \"hello\"", __FILE__, __LINE__); + + rewind(fp); + errno = 0; + rc = (int) fread(buf, 
sizeof(buf), 1, fp); + err = errno; + ok(rc != 1 && err == 0, + "%s:%d fread() EOF: %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = feof(fp); + err = errno; + ok(rc != 0 && err == 0, "%s:%d feof() past EOF: %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = fclose(fp); + err = errno; + ok(rc == 0 && err == 0, "%s:%d fclose(): %s", + __FILE__, __LINE__, strerror(err)); + + /* fwrite to closed stream fails with errno=EBADF */ + errno = 0; + rc = (int) fwrite("hello world", 12, 1, fp); + err = errno; + ok(rc == 0 && err == EBADF, + "%s:%d fwrite() to closed stream fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + /* fread from closed stream fails with errno=EBADF */ + errno = 0; + rc = (int) fread(buf, 15, 1, fp); + err = errno; + ok(rc == 0 && err == EBADF, + "%s:%d fread() from closed stream fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + /* fgets from closed stream fails with errno=EBADF */ + memset(buf, 0, sizeof(buf)); + errno = 0; + tmp = fgets(buf, 15, fp); + err = errno; + ok(err == EBADF, + "%s:%d fgets() from closed stream fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + errno = 0; + rc = feof(fp); + err = errno; + ok(rc != 0 && err == 0, + "%s:%d feof() on closed stream: %s", + __FILE__, __LINE__, strerror(err)); + + diag("Finished UNIFYFS_WRAP(fwrite/fread/fgets/feof) tests"); + + return 0; +} diff --git a/t/std/size.c b/t/std/size.c new file mode 100644 index 000000000..03faf5fe5 --- /dev/null +++ b/t/std/size.c @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2019, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2018, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#include +#include +#include +#include +#include +#include +#include "t/lib/tap.h" +#include "t/lib/testutil.h" + +/* + * Test correctness of global file size. Also, test opening a file for append + */ + +int size_test(char* unifyfs_root) +{ + diag("Starting file size and fwrite/fread with append tests"); + + char path[64]; + char buf[64] = {0}; + FILE* fp = NULL; + char* tmp; + size_t global; + int fd; + + errno = 0; + + testutil_rand_path(path, sizeof(path), unifyfs_root); + + /* Write "hello world" to a file */ + fp = fopen(path, "w"); + ok(fp != NULL, "%s:%d fopen(%s): %s", + __FILE__, __LINE__, path, strerror(errno)); + ok(fwrite("hello world", 12, 1, fp) == 1, + "%s:%d fwrite(\"hello world\": %s", __FILE__, __LINE__, strerror(errno)); + ok(fclose(fp) == 0, "%s:%d fclose(): %s", + __FILE__, __LINE__, strerror(errno)); + + testutil_get_size(path, &global); + ok(global == 12, "%s:%d global size after fwrite(\"hello world\") = %d: %s", + __FILE__, __LINE__, global, strerror(errno)); + + /* Open the file again with append, write to it. */ + fp = fopen(path, "a"); + ok(fp != NULL, "%s:%d fopen(%s) in append mode: %s", + __FILE__, __LINE__, path, strerror(errno)); + ok(fwrite("HELLO WORLD", 12, 1, fp) == 1, + "%s:%d fwrite(\"HELLO WORLD\") with file %s open for append: %s", + __FILE__, __LINE__, path, strerror(errno)); + + ok(ftell(fp) == 24, "%s:%d ftell() after appending to file: %s", + __FILE__, __LINE__, strerror(errno)); + + /* + * Set our position to somewhere in the middle of the file. 
Since the file
+     * is in append mode, this new position should be ignored, and writes
+     * should still go to the end of the file.
+     */
+    ok(fseek(fp, 11, SEEK_SET) == 0, "%s:%d fseek(11) before append: %s",
+        __FILE__, __LINE__, strerror(errno));
+    ok(fwrite("<end>", 6, 1, fp) == 1,
+        "%s:%d fwrite(\"<end>\") to append after seek to middle of file: %s",
+        __FILE__, __LINE__, strerror(errno));
+
+    ok(ftell(fp) == 30, "%s:%d ftell() after seek and appending to file: %s",
+        __FILE__, __LINE__, strerror(errno));
+
+    ok(fclose(fp) == 0, "%s:%d fclose(): %s",
+        __FILE__, __LINE__, strerror(errno));
+
+    testutil_get_size(path, &global);
+    ok(global == 30, "%s:%d global size after append is %d: %s",
+        __FILE__, __LINE__, global, strerror(errno));
+
+    /* Sync extents */
+    fd = open(path, O_RDWR);
+    ok(fd >= 0, "%s:%d open file for fsync: %s",
+        __FILE__, __LINE__, strerror(errno));
+    ok(fsync(fd) == 0, "%s:%d fsync(): %s",
+        __FILE__, __LINE__, strerror(errno));
+    ok(close(fd) != -1, "%s:%d close after fsync: %s",
+        __FILE__, __LINE__, strerror(errno));
+
+    /* Laminate */
+    ok(chmod(path, 0444) == 0, "%s:%d chmod(0444): %s",
+        __FILE__, __LINE__, strerror(errno));
+
+    /* Global size should be correct */
+    testutil_get_size(path, &global);
+    ok(global == 30, "%s:%d global size after laminate is %d: %s",
+        __FILE__, __LINE__, global, strerror(errno));
+
+    /* Read it back */
+    fp = fopen(path, "r");
+    ok(fp != NULL, "%s:%d fopen(%s): %s",
+        __FILE__, __LINE__, path, strerror(errno));
+
+    memset(buf, 0, sizeof(buf));
+    ok(fread(buf, 30, 1, fp) == 1, "%s:%d fread() buf[]=\"%s\": %s",
+        __FILE__, __LINE__, buf, strerror(errno));
+
+    /*
+     * We wrote three strings to the file: "hello world" "HELLO WORLD" and
+     * "<end>". Replace the '\0' after the first two strings with spaces
+     * so we can compare the file contents as one big string.
+     */
+    buf[11] = ' ';     /* after "hello world" */
+    buf[23] = ' ';     /* after "HELLO WORLD" */
+
+    is(buf, "hello world HELLO WORLD <end>",
+        "%s:%d saw \"hello world HELLO WORLD <end>\"", __FILE__, __LINE__);
+
+    /* Try seeking and reading at various positions */
+    ok(fseek(fp, 6, SEEK_SET) == 0, "%s:%d fseek(6): %s",
+        __FILE__, __LINE__, strerror(errno));
+
+    ok(fread(buf, 6, 1, fp) == 1, "%s:%d fread() at offset 6 buf[]=\"%s\": %s",
+        __FILE__, __LINE__, buf, strerror(errno));
+    is(buf, "world", "%s:%d saw \"world\"", __FILE__, __LINE__);
+
+    rewind(fp);
+    ok(fread(buf, 12, 1, fp) == 1,
+        "%s:%d fread() after rewind() buf[]=\"%s\": %s",
+        __FILE__, __LINE__, buf, strerror(errno));
+    is(buf, "hello world", "%s:%d saw \"hello world\"", __FILE__, __LINE__);
+
+    rewind(fp);
+    memset(buf, 0, sizeof(buf));
+    tmp = fgets(buf, 12, fp);
+    ok(tmp == buf, "%s:%d fgets() after rewind() buf[]=\"%s\": %s",
+        __FILE__, __LINE__, buf, strerror(errno));
+    is(buf, "hello world", "%s:%d saw \"hello world\"", __FILE__, __LINE__);
+
+    rewind(fp);
+    memset(buf, 0, sizeof(buf));
+    tmp = fgets(buf, 6, fp);
+    ok(tmp == buf, "%s:%d fgets() w/ size = 6 after rewind() buf[]=\"%s\": %s",
+        __FILE__, __LINE__, buf, strerror(errno));
+    is(buf, "hello", "%s:%d saw \"hello\"", __FILE__, __LINE__);
+
+    rewind(fp);
+    ok(fread(buf, sizeof(buf), 1, fp) != 1,
+        "%s:%d fread() past EOF: %s", __FILE__, __LINE__, strerror(errno));
+
+    ok(feof(fp) != 0, "%s:%d feof() past EOF: %s",
+        __FILE__, __LINE__, strerror(errno));
+
+    ok(fclose(fp) == 0, "%s:%d fclose(): %s",
+        __FILE__, __LINE__, strerror(errno));
+
+    diag("Finished file size and fwrite/fread with append tests");
+
+    return 0;
+}
diff --git a/t/std/stdio_suite.c b/t/std/stdio_suite.c
index 2a1dd4e8c..df30e4c4e 100644
--- a/t/std/stdio_suite.c
+++ b/t/std/stdio_suite.c
@@ -71,6 +71,14 @@ int main(int argc, char* argv[])
 
     fopen_fclose_test(unifyfs_root);
 
+    fseek_ftell_test(unifyfs_root);
+
+    fwrite_fread_test(unifyfs_root);
+
+    fflush_test(unifyfs_root);
+
+    size_test(unifyfs_root);
+
     MPI_Finalize();
 
     done_testing();
diff --git a/t/std/stdio_suite.h b/t/std/stdio_suite.h
index df5033a7b..9257f5f99 100644
--- a/t/std/stdio_suite.h
+++ b/t/std/stdio_suite.h
@@ -33,4 +33,16 @@
 /* Tests for UNIFYFS_WRAP(fopen) and UNIFYFS_WRAP(fclose) */
 int fopen_fclose_test(char* unifyfs_root);
 
+/* Tests for UNIFYFS_WRAP(fseek/ftell/rewind) */
+int fseek_ftell_test(char* unifyfs_root);
+
+/* Tests for UNIFYFS_WRAP(fwrite) and UNIFYFS_WRAP(fread) */
+int fwrite_fread_test(char* unifyfs_root);
+
+/* Tests for UNIFYFS_WRAP(fflush) */
+int fflush_test(char* unifyfs_root);
+
+/* Tests for UNIFYFS_WRAP(size) */
+int size_test(char* unifyfs_root);
+
 #endif /* STDIO_SUITE_H */
diff --git a/t/sys/chdir.c b/t/sys/chdir.c
new file mode 100644
index 000000000..67600e9fa
--- /dev/null
+++ b/t/sys/chdir.c
@@ -0,0 +1,448 @@
+/*
+ * Copyright (c) 2019, Lawrence Livermore National Security, LLC.
+ * Produced at the Lawrence Livermore National Laboratory.
+ *
+ * Copyright 2018, UT-Battelle, LLC.
+ *
+ * LLNL-CODE-741539
+ * All rights reserved.
+ *
+ * This is the license for UnifyFS.
+ * For details, see https://github.com/LLNL/UnifyFS.
+ * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text.
+ */ + + /* + * Test chdir/fchdir/getcwd/getwd/get_current_dir_name + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "t/lib/tap.h" +#include "t/lib/testutil.h" + +int chdir_test(char* unifyfs_root) +{ + diag("Starting UNIFYFS_WRAP(chdir/fchdir/getcwd/getwd/" + "get_current_dir_name) tests"); + + char path[64]; + int err, rc; + char* str; + + testutil_rand_path(path, sizeof(path), unifyfs_root); + + /* define a dir1 subdirectory within unifyfs space */ + char buf[64] = {0}; + snprintf(buf, sizeof(buf), "%s/dir1", unifyfs_root); + errno = 0; + rc = mkdir(buf, 0700); + err = errno; + ok(rc == 0 || err == EEXIST, "%s:%d mkdir(%s): %s", + __FILE__, __LINE__, buf, strerror(err)); + + /* define a dir2 subdirectory within unifyfs space */ + char buf2[64] = {0}; + snprintf(buf2, sizeof(buf2), "%s/dir2", unifyfs_root); + errno = 0; + rc = mkdir(buf2, 0700); + err = errno; + ok(rc == 0 || err == EEXIST, "%s:%d mkdir(%s): %s", + __FILE__, __LINE__, buf2, strerror(err)); + + /* change to root directory */ + errno = 0; + rc = chdir("/"); + err = errno; + ok(rc == 0 && err == 0, "%s:%d chdir(%s): %s", + __FILE__, __LINE__, "/", strerror(err)); + + /* check that we're in root directory */ + errno = 0; + str = getcwd(path, sizeof(path)); + err = errno; + ok(str != NULL && err == 0, "%s:%d getcwd: %s", + __FILE__, __LINE__, strerror(err)); + is(str, "/", + "%s:%d getcwd returned %s expected %s", + __FILE__, __LINE__, str, "/"); + + /* change to unifyfs root directory */ + errno = 0; + rc = chdir(unifyfs_root); + err = errno; + ok(rc == 0 && err == 0, "%s:%d chdir(%s): %s", + __FILE__, __LINE__, unifyfs_root, strerror(err)); + + /* check that we're in unifyfs root directory */ + errno = 0; + str = getcwd(path, sizeof(path)); + err = errno; + ok(str != NULL && err == 0, "%s:%d getcwd: %s", + __FILE__, __LINE__, strerror(err)); + is(str, unifyfs_root, + "%s:%d getcwd returned %s expected %s", + __FILE__, __LINE__, str, unifyfs_root); + + /* get length of current directory */ + size_t len = strlen(str); + + /* try getcwd with a buffer short by one byte, + * should fail with ERANGE */ + errno = 0; + str = getcwd(path, len); // pass + err = errno; + ok(str == NULL && err == ERANGE, + "%s:%d getcwd(buf, short_len): (errno=%d) %s", + __FILE__, __LINE__, err, strerror(err)); + + /* try getcwd with a NULL buffer (Linux glibc extension to POSIX) */ + errno = 0; + str = getcwd(NULL, sizeof(path)); + err = errno; + ok(str != NULL && err == 0, "%s:%d getcwd(NULL, good_len): (errno=%d) %s", + __FILE__, __LINE__, err, strerror(err)); + if (str != NULL) { + free(str); + } + + /* try getcwd with a NULL buffer but short size, + * should fail with ERANGE */ + errno = 0; + str = getcwd(NULL, len); + err = errno; + ok(str == NULL && err == ERANGE, + "%s:%d getcwd(NULL, short_len): (errno=%d) %s", + __FILE__, __LINE__, err, strerror(err)); + if (str != NULL) { + free(str); + } + + /* try getcwd with a buffer and 0 size, should fail with EINVAL */ + errno = 0; + str = getcwd(path, 0); + err = errno; + ok(str == NULL && err == EINVAL, "%s:%d getcwd(buf, 0): (errno=%d) %s", + __FILE__, __LINE__, err, strerror(err)); + if (str != NULL) { + free(str); + } + + /* try getcwd with a NULL buffer and 0 size, + * getcwd should allocate buffer (Linux glibc extension to POSIX) */ + errno = 0; + str = getcwd(NULL, 0); + err = errno; + ok(str != NULL && err == 0, "%s:%d getcwd(NULL, 0): (errno=%d) %s", + __FILE__, __LINE__, err, strerror(err)); + if (str != NULL) { + free(str); + 
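+        /* Note: getcwd(NULL, 0) relies on the glibc extension in which
+         * getcwd() malloc()s the result, so the returned string is freed
+         * above; the get_current_dir_name() results later in this test
+         * are released the same way. */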
}
+
+    /* change to unifyfs/dir1 */
+    errno = 0;
+    rc = chdir(buf);
+    err = errno;
+    ok(rc == 0 && err == 0, "%s:%d chdir(%s): %s",
+        __FILE__, __LINE__, buf, strerror(err));
+
+    /* check that we're in unifyfs/dir1 */
+    errno = 0;
+    str = getcwd(path, sizeof(path));
+    err = errno;
+    ok(str != NULL && err == 0, "%s:%d getcwd: %s",
+        __FILE__, __LINE__, strerror(err));
+    is(str, buf,
+        "%s:%d getcwd returned %s expected %s",
+        __FILE__, __LINE__, str, buf);
+
+    /* change back to root unifyfs directory */
+    errno = 0;
+    rc = chdir("..");
+    err = errno;
+    ok(rc == 0 && err == 0, "%s:%d chdir(%s): %s",
+        __FILE__, __LINE__, "..", strerror(err));
+
+    /* check that we're in root unifyfs directory */
+    errno = 0;
+    str = getcwd(path, sizeof(path));
+    err = errno;
+    ok(str != NULL && err == 0, "%s:%d getcwd: %s",
+        __FILE__, __LINE__, strerror(err));
+    is(str, unifyfs_root,
+        "%s:%d getcwd returned %s expected %s",
+        __FILE__, __LINE__, str, unifyfs_root);
+
+    /* change to unifyfs/dir1 directory using relative path */
+    errno = 0;
+    rc = chdir("dir1");
+    err = errno;
+    ok(rc == 0 && err == 0, "%s:%d chdir(%s): %s",
+        __FILE__, __LINE__, "dir1", strerror(err));
+
+    /* check that we're in unifyfs/dir1 directory */
+    errno = 0;
+    str = getcwd(path, sizeof(path));
+    err = errno;
+    ok(str != NULL && err == 0, "%s:%d getcwd: %s",
+        __FILE__, __LINE__, strerror(err));
+    is(str, buf,
+        "%s:%d getcwd returned %s expected %s",
+        __FILE__, __LINE__, str, buf);
+
+    /* change to unifyfs/dir2 directory in strange way */
+    errno = 0;
+    rc = chdir("././.././dir2");
+    err = errno;
+    ok(rc == 0 && err == 0, "%s:%d chdir(%s): %s",
+        __FILE__, __LINE__, "././.././dir2", strerror(err));
+
+    /* check that we're in unifyfs/dir2 directory */
+    errno = 0;
+    str = getcwd(path, sizeof(path));
+    err = errno;
+    ok(str != NULL && err == 0, "%s:%d getcwd: %s",
+        __FILE__, __LINE__, strerror(err));
+    is(str, buf2,
+        "%s:%d getcwd returned %s expected %s",
+        __FILE__, __LINE__, str, buf2);
+
+
+/* TODO: Some compilers throw a warning/error if one uses getwd().
+ * For those compilers that allow it, it would be nice to execute
+ * these tests. For now, I'll leave this here as a reminder.
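+ * (For reference: getwd() takes no buffer-size argument, so it was marked
+ * legacy in POSIX.1-2001 and removed in POSIX.1-2008; glibc tags it as
+ * deprecated, which is what triggers the warnings/errors mentioned above.)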
*/ +#if 0 + /* change to root directory */ + errno = 0; + rc = chdir("/"); + err = errno; + ok(rc == 0 && err == 0, "%s:%d chdir(%s): %s", + __FILE__, __LINE__, "/", strerror(err)); + + /* check that we're in root directory */ + errno = 0; + str = getwd(pathmax); + err = errno; + ok(str != NULL && err == 0, "%s:%d getcwd: %s", + __FILE__, __LINE__, strerror(err)); + is(str, "/", + "%s:%d getcwd returned %s expected %s", + __FILE__, __LINE__, str, "/"); + + /* change to unifyfs root directory */ + errno = 0; + rc = chdir(unifyfs_root); + err = errno; + ok(rc == 0 && err == 0, "%s:%d chdir(%s): %s", + __FILE__, __LINE__, unifyfs_root, strerror(err)); + + /* check that we're in unifyfs root directory */ + errno = 0; + str = getwd(pathmax); + err = errno; + ok(str != NULL && err == 0, "%s:%d getcwd: %s", + __FILE__, __LINE__, strerror(err)); + is(str, unifyfs_root, + "%s:%d getcwd returned %s expected %s", + __FILE__, __LINE__, str, unifyfs_root); + + /* change to directory within unifyfs */ + errno = 0; + rc = chdir(buf); + err = errno; + ok(rc == 0 && err == 0, "%s:%d chdir(%s): %s", + __FILE__, __LINE__, buf, strerror(err)); + + /* check that we're in directory within unifyfs */ + errno = 0; + str = getwd(pathmax); + err = errno; + ok(str != NULL && err == 0, "%s:%d getcwd: %s", + __FILE__, __LINE__, strerror(err)); + is(str, buf, + "%s:%d getcwd returned %s expected %s", + __FILE__, __LINE__, str, buf); + + /* change back to root unifyfs directory */ + errno = 0; + rc = chdir(".."); + err = errno; + ok(rc == 0 && err == 0, "%s:%d chdir(%s): %s", + __FILE__, __LINE__, "..", strerror(err)); + + /* check that we're in root unifyfs directory */ + errno = 0; + str = getwd(pathmax); + err = errno; + ok(str != NULL && err == 0, "%s:%d getcwd: %s", + __FILE__, __LINE__, strerror(err)); + is(str, unifyfs_root, + "%s:%d getcwd returned %s expected %s", + __FILE__, __LINE__, str, unifyfs_root); +#endif + + /* change to root directory */ + errno = 0; + rc = chdir("/"); + err = errno; + ok(rc == 0 && err == 0, "%s:%d chdir(%s): %s", + __FILE__, __LINE__, "/", strerror(err)); + + /* check that we're in root directory */ + errno = 0; + str = get_current_dir_name(); + err = errno; + ok(str != NULL && err == 0, "%s:%d get_current_dir_name: %s", + __FILE__, __LINE__, strerror(err)); + is(str, "/", + "%s:%d get_current_dir_name returned %s expected %s", + __FILE__, __LINE__, str, "/"); + if (str != NULL) { + free(str); + } + + /* change to unifyfs root directory */ + errno = 0; + rc = chdir(unifyfs_root); + err = errno; + ok(rc == 0 && err == 0, "%s:%d chdir(%s): %s", + __FILE__, __LINE__, unifyfs_root, strerror(err)); + + /* check that we're in unifyfs root directory */ + errno = 0; + str = get_current_dir_name(); + err = errno; + ok(str != NULL && err == 0, "%s:%d get_current_dir_name: %s", + __FILE__, __LINE__, strerror(err)); + is(str, unifyfs_root, + "%s:%d get_current_dir_name returned %s expected %s", + __FILE__, __LINE__, str, unifyfs_root); + if (str != NULL) { + free(str); + } + + /* change to unifyfs/dir1 */ + errno = 0; + rc = chdir(buf); + err = errno; + ok(rc == 0 && err == 0, "%s:%d chdir(%s): %s", + __FILE__, __LINE__, buf, strerror(err)); + + /* check that we're in unifyfs/dir1 */ + errno = 0; + str = get_current_dir_name(); + err = errno; + ok(str != NULL && err == 0, "%s:%d get_current_dir_name: %s", + __FILE__, __LINE__, strerror(err)); + is(str, buf, + "%s:%d get_current_dir_name returned %s expected %s", + __FILE__, __LINE__, str, buf); + if (str != NULL) { + free(str); + } + + /* 
change back to root unifyfs directory */ + errno = 0; + rc = chdir(".."); + err = errno; + ok(rc == 0 && err == 0, "%s:%d chdir(%s): %s", + __FILE__, __LINE__, "..", strerror(err)); + + /* check that we're in root unifyfs directory */ + errno = 0; + str = get_current_dir_name(); + err = errno; + ok(str != NULL && err == 0, "%s:%d get_current_dir_name: %s", + __FILE__, __LINE__, strerror(err)); + is(str, unifyfs_root, + "%s:%d get_current_dir_name returned %s expected %s", + __FILE__, __LINE__, str, unifyfs_root); + if (str != NULL) { + free(str); + } + + /* change to unifyfs/dir2 directory */ + errno = 0; + rc = chdir("dir2"); + err = errno; + ok(rc == 0 && err == 0, "%s:%d chdir(%s): %s", + __FILE__, __LINE__, "dir2", strerror(err)); + + /* check that we're in unifyfs/dir2 */ + errno = 0; + str = get_current_dir_name(); + err = errno; + ok(str != NULL && err == 0, "%s:%d get_current_dir_name: %s", + __FILE__, __LINE__, strerror(err)); + is(str, buf2, + "%s:%d get_current_dir_name returned %s expected %s", + __FILE__, __LINE__, str, buf2); + if (str != NULL) { + free(str); + } + + +/* TODO: Our directory wrappers are not fully functioning yet, + * but when they do, we should check that fchdir works. */ + skip(1, 7, "fchdir tests until directory wrappers are fully functional") + /* change to root directory */ + errno = 0; + rc = chdir("/"); + err = errno; + ok(rc == 0 && err == 0, "%s:%d chdir(%s): %s", + __FILE__, __LINE__, "/", strerror(err)); + + /* open a directory in unifyfs */ + errno = 0; + DIR* dirp = opendir(buf); + err = errno; + ok(dirp != NULL && err == 0, "%s:%d opendir(%s): %s", + __FILE__, __LINE__, buf, strerror(err)); + + errno = 0; + int fd = dirfd(dirp); + err = errno; + ok(fd >= 0 && err == 0, "%s:%d dirfd(%p): %s", + __FILE__, __LINE__, dirp, strerror(err)); + + /* use fchdir to change into it */ + errno = 0; + rc = fchdir(fd); + err = errno; + ok(rc == 0 && err == 0, "%s:%d fchdir(%d): %s", + __FILE__, __LINE__, fd, strerror(err)); + + closedir(dirp); + + /* open root directory */ + errno = 0; + dirp = opendir("/"); + err = errno; + ok(dirp != NULL && err == 0, "%s:%d opendir(%s): %s", + __FILE__, __LINE__, "/", strerror(err)); + + errno = 0; + fd = dirfd(dirp); + err = errno; + ok(fd >= 0 && err == 0, "%s:%d dirfd(%p): %s", + __FILE__, __LINE__, dirp, strerror(err)); + + /* use fchdir to change into it */ + errno = 0; + rc = fchdir(fd); + err = errno; + ok(rc == 0 && err == 0, "%s:%d fchdir(%d): %s", + __FILE__, __LINE__, fd, strerror(err)); + + closedir(dirp); + end_skip; + + return 0; +} diff --git a/t/sys/creat-close.c b/t/sys/creat-close.c index 16367036b..f7c92d556 100644 --- a/t/sys/creat-close.c +++ b/t/sys/creat-close.c @@ -34,7 +34,7 @@ int creat_close_test(char* unifyfs_root) char path[64]; int mode = 0600; int fd = -1; - int rc = -1; + int err, rc; /* Create a random file name at the mountpoint path to test on */ testutil_rand_path(path, sizeof(path), unifyfs_root); @@ -42,38 +42,48 @@ int creat_close_test(char* unifyfs_root) /* Verify closing a non-existent file fails with errno=EBADF */ errno = 0; rc = close(fd); - ok(rc < 0 && errno == EBADF, - "close non-existing file %s should fail (rc=%d, errno=%d): %s", - path, rc, errno, strerror(errno)); + err = errno; + ok(rc == -1 && err == EBADF, + "%s:%d close non-existing file %s should fail (errno=%d): %s", + __FILE__, __LINE__, path, err, strerror(err)); /* Verify we can create a non-existent file. 
*/ errno = 0; fd = creat(path, mode); - ok(fd >= 0, "creat non-existing file %s (fd=%d): %s", - path, fd, strerror(errno)); + err = errno; + ok(fd >= 0 && err == 0, + "%s:%d creat non-existing file %s (fd=%d): %s", + __FILE__, __LINE__, path, fd, strerror(err)); /* Verify close succeeds. */ errno = 0; rc = close(fd); - ok(rc == 0, "close new file %s (rc=%d): %s", path, rc, strerror(errno)); + err = errno; + ok(rc == 0 && err == 0, "%s:%d close new file: %s", + __FILE__, __LINE__, strerror(err)); /* Verify creating an already created file succeeds. */ errno = 0; fd = creat(path, mode); - ok(fd >= 0, "creat existing file %s (fd=%d): %s", - path, fd, strerror(errno)); + err = errno; + ok(fd >= 0 && err == 0, + "%s:%d creat existing file %s (fd=%d): %s", + __FILE__, __LINE__, path, fd, strerror(err)); /* Verify close succeeds. */ errno = 0; rc = close(fd); - ok(rc == 0, "close %s (rc=%d): %s", path, rc, strerror(errno)); + err = errno; + ok(rc == 0 && err == 0, "%s:%d close %s: %s", + __FILE__, __LINE__, path, strerror(err)); /* Verify closing already closed file fails with errno=EBADF */ errno = 0; rc = close(fd); - ok(rc < 0 && errno == EBADF, - "close already closed file %s should fail (rc=%d, errno=%d): %s", - path, rc, errno, strerror(errno)); + err = errno; + ok(rc == -1 && err == EBADF, + "%s:%d close already closed file %s should fail (errno=%d): %s", + __FILE__, __LINE__, path, err, strerror(err)); /* CLEANUP * diff --git a/t/sys/creat64.c b/t/sys/creat64.c index f2a6ab126..4b4495c6c 100644 --- a/t/sys/creat64.c +++ b/t/sys/creat64.c @@ -34,29 +34,30 @@ int creat64_test(char* unifyfs_root) char path[64]; int mode = 0600; - int fd; - int rc; + int err, fd; /* Create a random file name at the mountpoint path to test on */ testutil_rand_path(path, sizeof(path), unifyfs_root); - skip(1, 2, "remove when UNIFYFS(create64) has been implemented"); /* Verify we can create a non-existent file. */ errno = 0; fd = creat64(path, mode); - ok(fd >= 0, "creat64 non-existing file %s (fd=%d): %s", - path, fd, strerror(errno)); + err = errno; + ok(fd >= 0 && err == 0, + "creat64 non-existing file %s (fd=%d): %s", + path, fd, strerror(err)); - rc = close(fd); + ok(close(fd) != -1, "close() worked"); /* Verify creating an already created file succeeds. */ errno = 0; fd = creat64(path, mode); - ok(fd >= 0, "creat64 existing file %s (fd=%d): %s", - path, fd, strerror(errno)); + err = errno; + ok(fd >= 0 && err == 0, + "creat64 existing file %s (fd=%d): %s", + path, fd, strerror(err)); - rc = close(fd); - end_skip; + ok(close(fd) != -1, "close() worked"); diag("Finished UNIFYFS_WRAP(creat64) tests"); diff --git a/t/sys/lseek.c b/t/sys/lseek.c new file mode 100644 index 000000000..8499412c6 --- /dev/null +++ b/t/sys/lseek.c @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2018, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2018, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "t/lib/tap.h" +#include "t/lib/testutil.h" + +/* This function contains the tests for UNIFYFS_WRAP(lseek) found in + * client/src/unifyfs-sysio.c. + * + * Notice the tests are ordered in a logical testing order. 
Changing the order + * or adding new tests in between two others could negatively affect the + * desired results. */ +int lseek_test(char* unifyfs_root) +{ + /* Diagnostic message for reading and debugging output */ + diag("Starting UNIFYFS_WRAP(lseek) tests"); + + char path[64]; + int file_mode = 0600; + int fd = -1; + int err, rc; + + /* Create a random file at the mountpoint path to test on */ + testutil_rand_path(path, sizeof(path), unifyfs_root); + + /* lseek in bad file descriptor should fail with errno=EBADF */ + errno = 0; + rc = (int) lseek(fd, 0, SEEK_SET); + err = errno; + ok(rc == -1 && err == EBADF, + "%s:%d lseek in bad file descriptor fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + /* Open a file and write to it to test lseek() */ + errno = 0; + fd = open(path, O_RDWR | O_CREAT | O_TRUNC, file_mode); + err = errno; + ok(fd >= 0 && err == 0, "%s:%d open worked: %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = (int) write(fd, "hello world", 12); + err = errno; + ok(rc == 12 && err == 0, + "%s:%d write worked: %s", __FILE__, __LINE__, strerror(err)); + + /* lseek with invalid whence fails with errno=EINVAL. */ + errno = 0; + rc = (int) lseek(fd, 0, -1); + err = errno; + ok(rc == -1 && err == EINVAL, + "%s:%d lseek with invalid whence should fail (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + /*--- lseek() with SEEK_SET tests ---*/ + + /* lseek to negative offset with SEEK_SET should fail with errno=EINVAL */ + errno = 0; + rc = (int) lseek(fd, -1, SEEK_SET); + err = errno; + ok(rc == -1 && err == EINVAL, + "%s:%d lseek(-1, SEEK_SET) to invalid offset fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + /* lseek to valid offset with SEEK_SET succeeds */ + errno = 0; + rc = (int) lseek(fd, 7, SEEK_SET); + err = errno; + ok(rc == 7 && err == 0, + "%s:%d lseek(7, SEEK_SET) to valid offset: %s", + __FILE__, __LINE__, strerror(err)); + + /* lseek beyond end of file with SEEK_SET succeeds */ + errno = 0; + rc = (int) lseek(fd, 25, SEEK_SET); + err = errno; + ok(rc == 25 && err == 0, + "%s:%d lseek(25, SEEK_SET) beyond EOF: %s", + __FILE__, __LINE__, strerror(err)); + + /* lseek to beginning of file with SEEK_SET succeeds */ + errno = 0; + rc = (int) lseek(fd, 0, SEEK_SET); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d lseek(0, SEEK_SET): %s", + __FILE__, __LINE__, strerror(err)); + + /*--- lseek() with SEEK_CUR tests ---*/ + + /* lseek to end of file with SEEK_CUR succeeds */ + errno = 0; + rc = (int) lseek(fd, 12, SEEK_CUR); + err = errno; + ok(rc == 12 && err == 0, + "%s:%d lseek(12, SEEK_CUR) to EOF: %s", + __FILE__, __LINE__, strerror(err)); + + /* lseek to negative offset with SEEK_CUR should fail with errno=EINVAL */ + errno = 0; + rc = (int) lseek(fd, -15, SEEK_CUR); + err = errno; + ok(rc == -1 && err == EINVAL, + "%s:%d lseek(-15, SEEK_CUR) to invalid offset fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + /* lseek to beginning of file with SEEK_CUR succeeds */ + errno = 0; + rc = (int) lseek(fd, -12, SEEK_CUR); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d lseek(-12, SEEK_CUR) to beginning of file: %s", + __FILE__, __LINE__, strerror(err)); + + /* lseek beyond end of file with SEEK_CUR succeeds */ + errno = 0; + rc = (int) lseek(fd, 25, SEEK_CUR); + err = errno; + ok(rc == 25 && err == 0, + "%s:%d lseek(25, SEEK_CUR) beyond EOF: %s", + __FILE__, __LINE__, strerror(err)); + + /*--- lseek() with SEEK_END tests ---*/ + + /* lseek to negative offset with SEEK_END should fail with 
errno=EINVAL */ + errno = 0; + rc = (int) lseek(fd, -15, SEEK_END); + err = errno; + ok(rc == -1 && err == EINVAL, + "%s:%d lseek(-15, SEEK_END) to invalid offset fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + /* lseek back one from end of file with SEEK_END succeeds */ + errno = 0; + rc = (int) lseek(fd, -1, SEEK_END); + err = errno; + ok(rc == 11 && err == 0, + "%s:%d lseek(-1, SEEK_END) from EOF: %s", + __FILE__, __LINE__, strerror(err)); + + /* lseek to beginning of file with SEEK_END succeeds */ + errno = 0; + rc = (int) lseek(fd, -12, SEEK_END); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d lseek(-12, SEEK_END) to beginning of file: %s", + __FILE__, __LINE__, strerror(err)); + + /* lseek beyond end of file with SEEK_END succeeds */ + errno = 0; + rc = (int) lseek(fd, 25, SEEK_END); + err = errno; + ok(rc == 37 && err == 0, + "%s:%d lseek(25, SEEK_END) beyond EOF: %s", + __FILE__, __LINE__, strerror(err)); + + /*--- lseek() with SEEK_DATA tests ---*/ + + /* Write beyond end of file to create a hole */ + errno = 0; + rc = (int) write(fd, "hello universe", 15); + err = errno; + ok(rc == 15 && err == 0, + "%s:%d write to create hole: %s", + __FILE__, __LINE__, strerror(err)); + + /* lseek to negative offset with SEEK_DATA should fail with errno=ENXIO */ + errno = 0; + rc = (int) lseek(fd, -1, SEEK_DATA); + err = errno; + ok(rc == -1 && err == ENXIO, + "%s:%d lseek(-1, SEEK_DATA) to invalid offset fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + /* lseek to beginning of file with SEEK_DATA succeeds */ + errno = 0; + rc = (int) lseek(fd, 0, SEEK_DATA); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d lseek(0, SEEK_DATA) w/ SEEK_DATA: %s", + __FILE__, __LINE__, strerror(err)); + + /* Fallback implementation: lseek to data after hole with SEEK_DATA returns + * current offset */ + errno = 0; + rc = (int) lseek(fd, 15, SEEK_DATA); + err = errno; + ok(rc == 15 && err == 0, + "%s:%d lseek(15, SEEK_DATA) to data after hole returns offset: %s", + __FILE__, __LINE__, strerror(err)); + + /* lseek beyond end of file with SEEK_DATA should fail with errno=ENXIO */ + errno = 0; + rc = (int) lseek(fd, 75, SEEK_DATA); + err = errno; + ok(rc == -1 && err == ENXIO, + "%s:%d lseek(75, SEEK_DATA) beyond EOF fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + /*--- lseek() with SEEK_HOLE tests ---*/ + + /* lseek to negative offset with SEEK_HOLE should fail with errno=ENXIO */ + errno = 0; + rc = (int) lseek(fd, -1, SEEK_HOLE); + err = errno; + ok(rc == -1 && err == ENXIO, + "%s:%d lseek(-1, SEEK_HOLE) to invalid offset fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + /* Fallback implementation: lseek to first hole of file with SEEK_HOLE + * returns or EOF */ + errno = 0; + rc = (int) lseek(fd, 0, SEEK_HOLE); + err = errno; + ok(rc == 52 && err == 0, + "%s:%d lseek(0, SEEK_HOLE) to first hole in file returns EOF: %s", + __FILE__, __LINE__, strerror(err)); + + /* Fallback implementation: lseek to middle of hole with SEEK_HOLE returns + * EOF */ + errno = 0; + rc = (int) lseek(fd, 18, SEEK_HOLE); + err = errno; + ok(rc == 52 && err == 0, + "%s:%d lseek(18, SEEK_HOLE) to middle of hole returns EOF: %s", + __FILE__, __LINE__, strerror(err)); + + /* lseek to end of file with SEEK_HOLE succeeds */ + errno = 0; + rc = (int) lseek(fd, 42, SEEK_HOLE); + err = errno; + ok(rc == 52 && err == 0, + "%s:%d lseek(42, SEEK_HOLE) to EOF w/ SEEK_HOLE: %s", + __FILE__, __LINE__, strerror(err)); + + /* lseek beyond end of file with SEEK_HOLE 
should fail with errno= ENXIO */ + errno = 0; + rc = (int) lseek(fd, 75, SEEK_HOLE); + err = errno; + ok(rc == -1 && err == ENXIO, + "%s:%d lseek(75, SEEK_HOLE) beyond EOF fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + close(fd); + + /* lseek in non-open file descriptor should fail with errno=EBADF */ + errno = 0; + rc = (int) lseek(fd, 0, SEEK_SET); + err = errno; + ok(rc == -1 && err == EBADF, + "%s:%d lseek in non-open file descriptor fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + diag("Finished UNIFYFS_WRAP(lseek) tests"); + + return 0; +} diff --git a/t/sys/mkdir-rmdir.c b/t/sys/mkdir-rmdir.c index e3ac88622..b276b9510 100644 --- a/t/sys/mkdir-rmdir.c +++ b/t/sys/mkdir-rmdir.c @@ -38,8 +38,7 @@ int mkdir_rmdir_test(char* unifyfs_root) char file_subdir_path[80]; int file_mode = 0600; int dir_mode = 0700; - int fd; - int rc; + int err, fd, rc; /* Create random dir and file path names at the mountpoint to test on */ testutil_rand_path(dir_path, sizeof(dir_path), unifyfs_root); @@ -64,115 +63,140 @@ int mkdir_rmdir_test(char* unifyfs_root) /* todo_mkdir_1: Remove when issue is resolved */ todo("mkdir_1: we currently don't support directory structure"); - /* Verify we cannot create a dir whose parent dir doesn't exist with + /* Verify creating a dir under non-existent parent dir fails with * errno=ENOENT */ errno = 0; rc = mkdir(subdir_path, dir_mode); - ok(rc < 0 && errno == ENOENT, - "mkdir dir %s without parent dir should fail (rc=%d, errno=%d): %s", - subdir_path, rc, errno, strerror(errno)); - + err = errno; + ok(rc == -1 && err == ENOENT, + "%s:%d mkdir dir %s without parent dir should fail (errno=%d): %s", + __FILE__, __LINE__, subdir_path, err, strerror(err)); end_todo; /* end todo_mkdir_1 */ /* Verify rmdir a non-existing directory fails with errno=ENOENT */ errno = 0; rc = rmdir(dir_path); - ok(rc < 0 && errno == ENOENT, - "rmdir non-existing dir %s should fail (rc=%d, errno=%d): %s", - dir_path, rc, errno, strerror(errno)); + err = errno; + ok(rc == -1 && err == ENOENT, + "%s:%d rmdir non-existing dir %s should fail (errno=%d): %s", + __FILE__, __LINE__, dir_path, err, strerror(err)); /* Verify we can create a non-existent directory. 
*/ errno = 0; rc = mkdir(dir_path, dir_mode); - ok(rc == 0, "mkdir non-existing dir %s (rc=%d): %s", - dir_path, rc, strerror(errno)); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d mkdir non-existing dir %s: %s", + __FILE__, __LINE__, dir_path, strerror(err)); - /* Verify we cannot recreate an already created directory with - * errno=EEXIST */ + /* Verify recreating an already created directory fails with errno=EEXIST */ errno = 0; rc = mkdir(dir_path, dir_mode); - ok(rc < 0 && errno == EEXIST, - "mkdir existing dir %s should fail (rc=%d, errno=%d): %s", - dir_path, rc, errno, strerror(errno)); + err = errno; + ok(rc == -1 && err == EEXIST, + "%s:%d mkdir existing dir %s should fail (errno=%d): %s", + __FILE__, __LINE__, dir_path, err, strerror(err)); - /* todo_mkdir_2: Remove when issue is resolved */ - todo("mkdir_2: should fail with errno=EISDIR=21"); - /* Verify we cannot create a file with same name as a directory with - * errno=EISDIR */ + /* Verify creating a file with same name as a dir fails with errno=EISDIR */ errno = 0; fd = creat(dir_path, file_mode); - ok(fd < 0 && errno == EISDIR, - "creat file with same name as dir %s should fail (fd=%d, errno=%d): %s", - dir_path, fd, errno, strerror(errno)); - end_todo; /* end todo_mkdir_2 */ + err = errno; + ok(fd == -1 && err == EISDIR, + "%s:%d creat file with same name as dir %s fails (fd=%d, errno=%d): %s", + __FILE__, __LINE__, dir_path, fd, err, strerror(err)); /* todo_mkdir_3: Remove when issue is resolved */ todo("mkdir_3: this fails because \"TODO mkdir_1\" is failing"); /* Verify we can create a subdirectory under an existing directory */ errno = 0; rc = mkdir(subdir_path, dir_mode); - ok(rc == 0, "mkdir subdirectory %s in existing dir (rc=%d): %s", - subdir_path, rc, strerror(errno)); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d mkdir subdirectory %s in existing dir: %s", + __FILE__, __LINE__, subdir_path, strerror(err)); end_todo; /* end todo_mkdir_3 */ /* Verify we can create a subfile under an existing directory */ errno = 0; fd = creat(subfile_path, file_mode); - ok(fd > 0, "creat subfile %s in existing dir (fd=%d): %s", - subfile_path, fd, strerror(errno)); + err = errno; + ok(fd >= 0 && err == 0, + "%s:%d creat subfile %s in existing dir (fd=%d): %s", + __FILE__, __LINE__, subfile_path, fd, strerror(err)); + errno = 0; rc = close(fd); + err = errno; + ok(rc == 0 && err == 0, "%s:%d close() worked: %s", + __FILE__, __LINE__, strerror(err)); - /* todo_mkdir_4: Remove when issue is resolved */ - todo("mkdir_4: unifyfs currently creates all paths as separate entities"); /* Verify creating a directory whose parent is a file fails with * errno=ENOTDIR */ + errno = 0; fd = creat(file_path, file_mode); + err = errno; + ok(fd >= 0 && err == 0, + "%s:%d creat parent file %s (fd=%d): %s", + __FILE__, __LINE__, file_path, fd, strerror(err)); + + errno = 0; rc = close(fd); + err = errno; + ok(rc == 0 && err == 0, "%s:%d close() worked: %s", + __FILE__, __LINE__, strerror(err)); + /* todo_mkdir_4: Remove when issue is resolved */ + todo("mkdir_4: unifyfs currently creates all paths as separate entities"); errno = 0; rc = mkdir(file_subdir_path, dir_mode); - ok(rc < 0 && errno == ENOTDIR, - "mkdir dir %s whose parent is a file should fail (rc=%d, errno=%d): %s", - file_subdir_path, rc, errno, strerror(errno)); + err = errno; + ok(rc == -1 && err == ENOTDIR, + "%s:%d mkdir dir %s where parent is a file should fail (errno=%d): %s", + __FILE__, __LINE__, file_subdir_path, err, strerror(err)); end_todo; /* end todo_mkdir_4 */ - /* 
Verify rmdir a non-directory fails with errno=ENOENT */ + /* Verify rmdir on non-directory fails with errno=ENOTDIR */ errno = 0; rc = rmdir(file_path); - ok(rc < 0 && errno == ENOTDIR, - "rmdir non-directory %s should fail (rc=%d, errno=%d): %s", - file_path, rc, errno, strerror(errno)); + err = errno; + ok(rc == -1 && err == ENOTDIR, + "%s:%d rmdir non-directory %s should fail (errno=%d): %s", + __FILE__, __LINE__, file_path, err, strerror(err)); /* todo_mkdir_5: Remove when issue is resolved */ todo("mkdir_5: unifyfs currently creates all paths as separate entities"); /* Verify rmdir a non-empty directory fails with errno=ENOTEMPTY */ errno = 0; rc = rmdir(dir_path); - ok(rc < 0 && errno == ENOTEMPTY, - "rmdir non-empty directory %s should fail (rc=%d, errno=%d): %s", - dir_path, rc, errno, strerror(errno)); + err = errno; + ok(rc == -1 && err == ENOTEMPTY, + "%s:%d rmdir non-empty directory %s should fail (errno=%d): %s", + __FILE__, __LINE__, dir_path, err, strerror(err)); end_todo; /* end todo_mkdir_5 */ /* Verify we can rmdir an empty directory */ errno = 0; rc = rmdir(subdir_path); - ok(rc == 0, "rmdir an empty directory %s (rc=%d): %s", - subdir_path, rc, strerror(errno)); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d rmdir an empty directory %s: %s", + __FILE__, __LINE__, subdir_path, strerror(err)); /* Verify rmdir an already removed directory fails with errno=ENOENT */ errno = 0; rc = rmdir(subdir_path); - ok(rc < 0 && errno == ENOENT, - "rmdir already removed dir %s should fail (rc=%d, errno=%d): %s", - subdir_path, rc, errno, strerror(errno)); + err = errno; + ok(rc == -1 && err == ENOENT, + "%s:%d rmdir already removed dir %s should fail (errno=%d): %s", + __FILE__, __LINE__, subdir_path, err, strerror(err)); /* Verify trying to rmdir the mount point fails with errno=EBUSY */ errno = 0; rc = rmdir(unifyfs_root); - ok(rc < 0 && errno == EBUSY, - "rmdir mount point %s should fail (rc=%d, errno=%d): %s", - unifyfs_root, rc, errno, strerror(errno)); + err = errno; + ok(rc == -1 && err == EBUSY, + "%s:%d rmdir mount point %s should fail (errno=%d): %s", + __FILE__, __LINE__, unifyfs_root, err, strerror(err)); /* CLEANUP * diff --git a/t/sys/open.c b/t/sys/open.c index d88a5d4a1..30489ffa6 100644 --- a/t/sys/open.c +++ b/t/sys/open.c @@ -36,8 +36,7 @@ int open_test(char* unifyfs_root) char dir_path[64]; int file_mode = 0600; int dir_mode = 0700; - int fd; - int rc; + int err, fd, rc; /* Create a random file and dir name at the mountpoint path to test on */ testutil_rand_path(path, sizeof(path), unifyfs_root); @@ -47,52 +46,59 @@ int open_test(char* unifyfs_root) * errno=ENOENT */ errno = 0; fd = open(path, O_RDWR, file_mode); - ok(fd < 0 && errno == ENOENT, + err = errno; + ok(fd < 0 && err == ENOENT, "open non-existing file %s w/out O_CREATE fails (fd=%d, errno=%d): %s", - path, fd, errno, strerror(errno)); + path, fd, err, strerror(err)); /* Verify we can create a new file. */ errno = 0; fd = open(path, O_CREAT|O_EXCL, file_mode); - ok(fd >= 0, "open non-existing file %s flags O_CREAT|O_EXCL (fd=%d): %s", - path, fd, strerror(errno)); + err = errno; + ok(fd >= 0 && err == 0, + "open non-existing file %s flags O_CREAT|O_EXCL (fd=%d): %s", + path, fd, strerror(err)); - rc = close(fd); + ok(close(fd) != -1, "close() worked"); /* Verify opening an existing file with O_CREAT|O_EXCL fails with * errno=EEXIST. 
*/ errno = 0; fd = open(path, O_CREAT|O_EXCL, file_mode); - ok(fd < 0 && errno == EEXIST, + err = errno; + ok(fd < 0 && err == EEXIST, "open existing file %s O_CREAT|O_EXCL should fail (fd=%d, errno=%d): %s", - path, fd, errno, strerror(errno)); + path, fd, err, strerror(err)); /* Verify opening an existing file with O_RDWR succeeds. */ errno = 0; fd = open(path, O_RDWR, file_mode); - ok(fd >= 0, "open existing file %s O_RDWR (fd=%d): %s", - path, fd, strerror(errno)); + err = errno; + ok(fd >= 0 && err == 0, + "open existing file %s O_RDWR (fd=%d): %s", + path, fd, strerror(err)); - rc = close(fd); + ok(close(fd) != -1, "close() worked"); - /* todo_open_1: Remove when issue is resolved */ - todo("open_1: should fail with errno=EISDIR=21"); - /* Verify opening a dir for write fails with errno=EISDIR */ + errno = 0; rc = mkdir(dir_path, dir_mode); + err = errno; + ok(rc == 0 && err == 0, "mkdir(%s, %o) worked, (errno=%d): %s", + dir_path, dir_mode, err, strerror(err)); errno = 0; fd = open(dir_path, O_RDWR, file_mode); - ok(fd < 0 && errno == EISDIR, + err = errno; + ok(fd < 0 && err == EISDIR, "open directory %s for write should fail (fd=%d, errno=%d): %s", - dir_path, fd, errno, strerror(errno)); - end_todo; /* end todo_open_1 */ + dir_path, fd, err, strerror(err)); - /* ClEANUP + /* CLEANUP * * Don't unlink `path` so that the final test (9020-mountpoint-empty) can * check if open left anything in the mountpoint and thus wasn't wrapped * properly. */ - rc = rmdir(dir_path); + ok(rmdir(dir_path) != -1, "rmdir() worked"); diag("Finished UNIFYFS_WRAP(open) tests"); diff --git a/t/sys/open64.c b/t/sys/open64.c index f831e72ae..127a1f085 100644 --- a/t/sys/open64.c +++ b/t/sys/open64.c @@ -36,8 +36,7 @@ int open64_test(char* unifyfs_root) char path[64]; int mode = 0600; - int fd; - int rc; + int err, fd; /* Create a random file name at the mountpoint path to test on */ testutil_rand_path(path, sizeof(path), unifyfs_root); @@ -46,33 +45,39 @@ int open64_test(char* unifyfs_root) * errno=ENOENT */ errno = 0; fd = open64(path, O_RDWR, mode); - ok(fd < 0 && errno == ENOENT, + err = errno; + ok(fd < 0 && err == ENOENT, "open64 non-existing file %s w/out O_CREATE fails (fd=%d, errno=%d): %s", - path, fd, errno, strerror(errno)); + path, fd, err, strerror(err)); /* Verify we can create a new file. */ errno = 0; fd = open64(path, O_CREAT|O_EXCL, mode); - ok(fd >= 0, "open64 non-existing file %s flags O_CREAT|O_EXCL (fd=%d): %s", - path, fd, strerror(errno)); + err = errno; + ok(fd >= 0 && err == 0, + "open64 non-existing file %s flags O_CREAT|O_EXCL (fd=%d): %s", + path, fd, strerror(err)); - rc = close(fd); + ok(close(fd) != -1, "close() worked"); /* Verify opening an existing file with O_CREAT|O_EXCL fails with * errno=EEXIST. */ errno = 0; fd = open64(path, O_CREAT|O_EXCL, mode); - ok(fd < 0 && errno == EEXIST, + err = errno; + ok(fd < 0 && err == EEXIST, "open64 existing file %s O_CREAT|O_EXCL fails (fd=%d, errno=%d): %s", - path, fd, errno, strerror(errno)); + path, fd, err, strerror(err)); /* Verify opening an existing file with O_RDWR succeeds. 
*/ errno = 0; fd = open64(path, O_RDWR, mode); - ok(fd >= 0, "open64 existing file %s O_RDWR (fd=%d): %s", - path, fd, strerror(errno)); + err = errno; + ok(fd >= 0 && err == 0, + "open64 existing file %s O_RDWR (fd=%d): %s", + path, fd, strerror(err)); - rc = close(fd); + ok(close(fd) != -1, "close() worked"); diag("Finished UNIFYFS_WRAP(open64) tests"); diff --git a/t/sys/sysio_suite.c b/t/sys/sysio_suite.c index e3db87859..63fa2acc2 100644 --- a/t/sys/sysio_suite.c +++ b/t/sys/sysio_suite.c @@ -81,6 +81,28 @@ int main(int argc, char* argv[]) open64_test(unifyfs_root); + lseek_test(unifyfs_root); + + write_read_test(unifyfs_root); + write_pre_existing_file_test(unifyfs_root); + + write_read_hole_test(unifyfs_root); + + truncate_test(unifyfs_root); + truncate_bigempty(unifyfs_root); + truncate_eof(unifyfs_root); + truncate_truncsync(unifyfs_root); + truncate_pattern_size(unifyfs_root, 0); + truncate_pattern_size(unifyfs_root, 2020); + truncate_empty_read(unifyfs_root, 0); + truncate_empty_read(unifyfs_root, 2020); + truncate_ftrunc_before_sync(unifyfs_root); + truncate_trunc_before_sync(unifyfs_root); + + unlink_test(unifyfs_root); + + chdir_test(unifyfs_root); + MPI_Finalize(); done_testing(); diff --git a/t/sys/sysio_suite.h b/t/sys/sysio_suite.h index 01f4e87f6..a24d7a9b5 100644 --- a/t/sys/sysio_suite.h +++ b/t/sys/sysio_suite.h @@ -45,4 +45,29 @@ int open_test(char* unifyfs_root); /* Tests for UNIFYFS_WRAP(open64) */ int open64_test(char* unifyfs_root); +/* Tests for UNIFYFS_WRAP(lseek) */ +int lseek_test(char* unifyfs_root); + +int write_read_test(char* unifyfs_root); + +int write_pre_existing_file_test(char* unifyfs_root); + +/* test reading from file with holes */ +int write_read_hole_test(char* unifyfs_root); + +/* Tests for UNIFYFS_WRAP(ftruncate) and UNIFYFS_WRAP(truncate) */ +int truncate_test(char* unifyfs_root); +int truncate_bigempty(char* unifyfs_root); +int truncate_eof(char* unifyfs_root); +int truncate_truncsync(char* unifyfs_root); +int truncate_pattern_size(char* unifyfs_root, int pos); +int truncate_empty_read(char* unifyfs_root, int pos); +int truncate_ftrunc_before_sync(char* unifyfs_root); +int truncate_trunc_before_sync(char* unifyfs_root); + +/* Test for UNIFYFS_WRAP(unlink) */ +int unlink_test(char* unifyfs_root); + +int chdir_test(char* unifyfs_root); + #endif /* SYSIO_SUITE_H */ diff --git a/t/sys/truncate.c b/t/sys/truncate.c new file mode 100644 index 000000000..0487032d8 --- /dev/null +++ b/t/sys/truncate.c @@ -0,0 +1,682 @@ +/* + * Copyright (c) 2019, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2018, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
+ */ + + /* + * Test truncate and ftruncate + */ +#include +#include +#include +#include +#include +#include "t/lib/tap.h" +#include "t/lib/testutil.h" + +int truncate_test(char* unifyfs_root) +{ + char path[64]; + int rc; + int fd; + size_t global; + + size_t bufsize = 1024*1024; + char* buf = (char*) malloc(bufsize); + + testutil_rand_path(path, sizeof(path), unifyfs_root); + + /* Open a new file for writing */ + fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0222); + ok(fd != -1, "%s:%d open(%s) (fd=%d): %s", + __FILE__, __LINE__, path, fd, strerror(errno)); + + /* file should be 0 bytes at this point */ + testutil_get_size(path, &global); + ok(global == 0, "%s:%d global size is %d expected %d", + __FILE__, __LINE__, global, 0); + + /* write 1MB and fsync, expect 1MB */ + rc = write(fd, buf, bufsize); + ok(rc == bufsize, "%s:%d write(%d) (rc=%d): %s", + __FILE__, __LINE__, bufsize, rc, strerror(errno)); + + rc = fsync(fd); + ok(rc == 0, "%s:%d fsync() (rc=%d): %s", + __FILE__, __LINE__, rc, strerror(errno)); + + testutil_get_size(path, &global); + ok(global == 1*bufsize, "%s:%d global size is %d expected %d", + __FILE__, __LINE__, global, 1*bufsize); + + /* skip a 1MB hole, write another 1MB, and fsync expect 3MB */ + rc = lseek(fd, 2*bufsize, SEEK_SET); + ok(rc == 2*bufsize, "%s:%d lseek(%d) (rc=%d): %s", + __FILE__, __LINE__, 2*bufsize, rc, strerror(errno)); + + rc = write(fd, buf, bufsize); + ok(rc == bufsize, "%s:%d write(%d) (rc=%d): %s", + __FILE__, __LINE__, bufsize, rc, strerror(errno)); + + rc = fsync(fd); + ok(rc == 0, "%s:%d fsync() (rc=%d): %s", + __FILE__, __LINE__, rc, strerror(errno)); + + testutil_get_size(path, &global); + ok(global == 3*bufsize, "%s:%d global size is %d expected %d", + __FILE__, __LINE__, global, 3*bufsize); + + /* ftruncate at 5MB, expect 5MB */ + rc = ftruncate(fd, 5*bufsize); + ok(rc == 0, "%s:%d ftruncate(%d) (rc=%d): %s", + __FILE__, __LINE__, 5*bufsize, rc, strerror(errno)); + + testutil_get_size(path, &global); + ok(global == 5*bufsize, "%s:%d global size is %d expected %d", + __FILE__, __LINE__, global, 5*bufsize); + + close(fd); + + /* truncate at 0.5 MB, expect 0.5MB */ + rc = truncate(path, bufsize/2); + ok(rc == 0, "%s:%d truncate(%d) (rc=%d): %s", + __FILE__, __LINE__, bufsize/2, rc, strerror(errno)); + + testutil_get_size(path, &global); + ok(global == bufsize/2, "%s:%d global size is %d expected %d", + __FILE__, __LINE__, global, bufsize/2); + + /* truncate to 0, expect 0 */ + rc = truncate(path, 0); + ok(rc == 0, "%s:%d truncate(%d) (rc=%d): %s", + __FILE__, __LINE__, 0, rc, strerror(errno)); + + testutil_get_size(path, &global); + ok(global == 0, "%s:%d global size is %d expected %d", + __FILE__, __LINE__, global, 0); + + free(buf); + + return 0; +} + +int truncate_bigempty(char* unifyfs_root) +{ + char path[64]; + int rc; + int fd; + size_t global; + + size_t bufsize = 1024*1024; + char* buf = (char*) malloc(bufsize); + + testutil_rand_path(path, sizeof(path), unifyfs_root); + + /* Open a new file for writing */ + fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0222); + ok(fd != -1, "%s:%d open(%s) (fd=%d): %s", + __FILE__, __LINE__, path, fd, strerror(errno)); + + testutil_get_size(path, &global); + ok(global == 0, "%s:%d global size is %d expected %d", + __FILE__, __LINE__, global, 0); + + /* ftruncate at 1TB, expect 1TB */ + off_t bigempty = 1024*1024*1024*1024ULL; + rc = ftruncate(fd, bigempty); + ok(rc == 0, "%s:%d ftruncate(%llu) (rc=%d): %s", + __FILE__, __LINE__, (unsigned long long) bigempty, + rc, strerror(errno)); + + 
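+    /* ftruncate() past EOF extends the file with an implicit hole, so the
+     * global size reported below should be the full 1TB even though no
+     * data blocks were ever written. */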
testutil_get_size(path, &global); + ok(global == (size_t)bigempty, "%s:%d global size is %llu expected %llu", + __FILE__, __LINE__, global, (unsigned long long)bigempty, + strerror(errno)); + + close(fd); + + free(buf); + + return 0; +} + +int truncate_eof(char* unifyfs_root) +{ + char path[64]; + int rc; + int fd; + size_t global; + + size_t bufsize = 1024*1024; + char* buf = (char*) malloc(bufsize); + + testutil_rand_path(path, sizeof(path), unifyfs_root); + + /* Open a new file for writing */ + fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0222); + ok(fd != -1, "%s:%d open(%s) (fd=%d): %s", + __FILE__, __LINE__, path, fd, strerror(errno)); + + /* file should be 0 bytes at this point */ + testutil_get_size(path, &global); + ok(global == 0, "%s:%d global size is %d expected %d", + __FILE__, __LINE__, global, 0); + + /* write 1MB */ + rc = write(fd, buf, bufsize); + ok(rc == bufsize, "%s:%d write(%d) (rc=%d): %s", + __FILE__, __LINE__, bufsize, rc, strerror(errno)); + + rc = fsync(fd); + ok(rc == 0, "%s:%d fsync() (rc=%d): %s", + __FILE__, __LINE__, rc, strerror(errno)); + + /* ftruncate at 0.5MB */ + rc = ftruncate(fd, bufsize/2); + ok(rc == 0, "%s:%d ftruncate(%d) (rc=%d): %s", + __FILE__, __LINE__, bufsize/2, rc, strerror(errno)); + + close(fd); + + /* Open a file for reading */ + fd = open(path, O_RDONLY); + ok(fd != -1, "%s:%d open(%s) (fd=%d): %s", + __FILE__, __LINE__, path, fd, strerror(errno)); + + /* ask for 1MB, should only get 0.5MB back */ + rc = read(fd, buf, bufsize); + ok(rc == bufsize/2, "%s:%d read(%d) (rc=%d): %s", + __FILE__, __LINE__, bufsize, rc, strerror(errno)); + + /* then should get 0 since at EOF */ + rc = read(fd, buf, bufsize); + ok(rc == 0, "%s:%d read(%d) (rc=%d): %s", + __FILE__, __LINE__, bufsize, rc, strerror(errno)); + + close(fd); + + /* truncate to 0 */ + rc = truncate(path, 0); + ok(rc == 0, "%s:%d truncate(%d) (rc=%d): %s", + __FILE__, __LINE__, 0, rc, strerror(errno)); + + /* Open a file for reading */ + fd = open(path, O_RDONLY); + ok(fd != -1, "%s:%d open(%s) (fd=%d): %s", + __FILE__, __LINE__, path, fd, strerror(errno)); + + /* should get 0 since immediately at EOF */ + rc = read(fd, buf, bufsize); + ok(rc == 0, "%s:%d read(%d) (rc=%d): %s", + __FILE__, __LINE__, bufsize, rc, strerror(errno)); + + close(fd); + + free(buf); + + return 0; +} + +int truncate_truncsync(char* unifyfs_root) +{ + char path[64]; + int rc; + int fd; + size_t global; + + size_t bufsize = 1024*1024; + char* buf = (char*) malloc(bufsize); + + testutil_rand_path(path, sizeof(path), unifyfs_root); + + /* Open a new file for writing */ + fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0222); + ok(fd != -1, "%s:%d open(%s) (fd=%d): %s", + __FILE__, __LINE__, path, fd, strerror(errno)); + + /* file should be 0 bytes at this point */ + testutil_get_size(path, &global); + ok(global == 0, "%s:%d global size is %d expected %d", + __FILE__, __LINE__, global, 0); + + /* write 1MB */ + rc = write(fd, buf, bufsize); + ok(rc == bufsize, "%s:%d write(%d) (rc=%d): %s", + __FILE__, __LINE__, bufsize, rc, strerror(errno)); + + /* ftruncate to 0.5MB */ + rc = ftruncate(fd, bufsize/2); + ok(rc == 0, "%s:%d ftruncate(%d) (rc=%d): %s", + __FILE__, __LINE__, bufsize/2, rc, strerror(errno)); + + /* file should be 0.5MB bytes at this point */ + testutil_get_size(path, &global); + ok(global == bufsize/2, "%s:%d global size is %d expected %d", + __FILE__, __LINE__, global, bufsize/2); + + rc = fsync(fd); + ok(rc == 0, "%s:%d fsync() (rc=%d): %s", + __FILE__, __LINE__, rc, strerror(errno)); + + /* file 
should still be 0.5MB bytes at this point */ + testutil_get_size(path, &global); + ok(global == bufsize/2, "%s:%d global size is %d expected %d", + __FILE__, __LINE__, global, bufsize/2); + + close(fd); + + free(buf); + + return 0; +} + +/* fill buffer with known pattern based on file offset */ +int fill_pattern(char* buf, size_t size, size_t start) +{ + size_t i; + for (i = 0; i < size; i++) { + char expected = ((i + start) % 26) + 'A'; + buf[i] = expected; + } + return 0; +} + +/* fill buffer with known pattern based on file offset */ +int check_pattern(char* buf, size_t size, size_t start) +{ + size_t i; + for (i = 0; i < size; i++) { + char expected = ((i + start) % 26) + 'A'; + if (buf[i] != expected) { + return (int)(i+1); + } + } + return 0; +} + +/* check that buffer is all zero */ +int check_zeros(char* buf, size_t size) +{ + size_t i; + for (i = 0; i < size; i++) { + if (buf[i] != (char)0) { + return (int)(i+1); + } + } + return 0; +} + +/* write a known pattern of a known size, truncate to something smaller, + * read until EOF and verify contents along the way */ +int truncate_pattern_size(char* unifyfs_root, off_t seekpos) +{ + char path[64]; + int rc; + int fd; + size_t global; + int i; + + size_t bufsize = 1024*1024; + char* buf = (char*) malloc(bufsize); + + testutil_rand_path(path, sizeof(path), unifyfs_root); + + /* Open a new file for writing */ + fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0222); + ok(fd != -1, "%s:%d open(%s) (fd=%d): %s", + __FILE__, __LINE__, path, fd, strerror(errno)); + + /* file should be 0 bytes at this point */ + testutil_get_size(path, &global); + ok(global == 0, "%s:%d global size is %d expected %d", + __FILE__, __LINE__, global, 0); + + /* write pattern out of 20 MB in size */ + size_t nwritten = 0; + for (i = 0; i < 20; i++) { + /* fill buffer with known pattern based on file offset */ + fill_pattern(buf, bufsize, nwritten); + + /* write data to file */ + rc = write(fd, buf, bufsize); + ok(rc == bufsize, "%s:%d write(%d) (rc=%d): %s", + __FILE__, __LINE__, bufsize, rc, strerror(errno)); + + /* track number of bytes written so far */ + nwritten += (size_t)rc; + } + + /* set size we'll truncate file to */ + off_t truncsize = 5*bufsize + 42; + + /* ftruncate to 5MB + 42 bytes */ + rc = ftruncate(fd, truncsize); + ok(rc == 0, "%s:%d ftruncate(%d) (rc=%d): %s", + __FILE__, __LINE__, (int)truncsize, rc, strerror(errno)); + + /* file should be of size 5MB + 42 at this point */ + testutil_get_size(path, &global); + ok(global == truncsize, "%s:%d global size is %d expected %d", + __FILE__, __LINE__, global, (int)truncsize); + + /* this kind of tests that the ftruncate above implied an fsync, + * can't really since the writes may have gone to disk on their + * own before ftruncate call */ + rc = fsync(fd); + ok(rc == 0, "%s:%d fsync() (rc=%d): %s", + __FILE__, __LINE__, rc, strerror(errno)); + + /* file should still be 5MB + 42 bytes at this point */ + testutil_get_size(path, &global); + ok(global == truncsize, "%s:%d global size is %d expected %d", + __FILE__, __LINE__, global, (int)truncsize); + + close(fd); + + /* read file back from offset 0 and verify size and contents */ + fd = open(path, O_RDONLY); + ok(fd != -1, "%s:%d open(%s) (fd=%d): %s", + __FILE__, __LINE__, path, fd, strerror(errno)); + + /* see to position if file if seekpos is set */ + if (seekpos > 0) { + off_t pos = lseek(fd, seekpos, SEEK_SET); + ok(pos == seekpos, "%s:%d lseek(%d) (rc=%d): %s", + __FILE__, __LINE__, pos, rc, strerror(errno)); + } + + off_t numread = 0; + while (1) 
{ + /* compute number of bytes we expect to read on next attempt */ + ssize_t expected = bufsize; + ssize_t remaining = (ssize_t)(truncsize - numread - seekpos); + if (expected > remaining) { + expected = remaining; + } + + /* ask for 1MB, should only get 0.5MB back */ + rc = read(fd, buf, bufsize); + ok(rc == expected, "%s:%d read(%d) (rc=%d) expected=%d %s", + __FILE__, __LINE__, bufsize, rc, expected, strerror(errno)); + + /* check that contents we read are correct */ + if (rc > 0) { + size_t start = numread + seekpos; + int check = check_pattern(buf, rc, start); + ok(check == 0, "%s:%d pattern check of bytes [%d, %d) rc=%d", + __FILE__, __LINE__, (int)start, (int)(start + rc), check); + + /* add to number of bytes read so far */ + numread += rc; + } + + /* break if we hit end of file */ + if (rc == 0) { + /* check that total read is expected size */ + ok(numread == (truncsize - seekpos), + "%s:%d read %d bytes, expected %d", + __FILE__, __LINE__, (int)numread, (int)(truncsize - seekpos)); + break; + } + + /* check that we don't run past expected + * end of file (and hang the test) */ + if (numread > (truncsize - seekpos)) { + ok(numread <= (truncsize - seekpos), + "%s:%d read %d bytes, expected %d", + __FILE__, __LINE__, (int)numread, (int)(truncsize - seekpos)); + break; + } + + /* break if we hit an error, would have been + * reported in read above */ + if (rc < 0) { + break; + } + } + + close(fd); + + free(buf); + + return 0; +} + +/* truncate an empty file to something and read until EOF, + * check size and contents of buffer */ +int truncate_empty_read(char* unifyfs_root, off_t seekpos) +{ + char path[64]; + int rc; + int fd; + size_t global; + + size_t bufsize = 1024*1024; + char* buf = (char*) malloc(bufsize); + + testutil_rand_path(path, sizeof(path), unifyfs_root); + + /* Open a new file for writing */ + fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0222); + ok(fd != -1, "%s:%d open(%s) (fd=%d): %s", + __FILE__, __LINE__, path, fd, strerror(errno)); + + /* file should be 0 bytes at this point */ + testutil_get_size(path, &global); + ok(global == 0, "%s:%d global size is %d expected %d", + __FILE__, __LINE__, global, 0); + + /* set size we'll truncate file to */ + off_t truncsize = 5*bufsize + 42; + + /* ftruncate to 5MB + 42 bytes */ + rc = ftruncate(fd, truncsize); + ok(rc == 0, "%s:%d ftruncate(%d) (rc=%d): %s", + __FILE__, __LINE__, (int)truncsize, rc, strerror(errno)); + + /* file should be of size 5MB + 42 at this point */ + testutil_get_size(path, &global); + ok(global == truncsize, "%s:%d global size is %d expected %d", + __FILE__, __LINE__, global, (int)truncsize); + + /* this kind of tests that the ftruncate above implied an fsync, + * can't really since the writes may have gone to disk on their + * own before ftruncate call */ + rc = fsync(fd); + ok(rc == 0, "%s:%d fsync() (rc=%d): %s", + __FILE__, __LINE__, rc, strerror(errno)); + + /* file should still be 5MB + 42 bytes at this point */ + testutil_get_size(path, &global); + ok(global == truncsize, "%s:%d global size is %d expected %d", + __FILE__, __LINE__, global, (int)truncsize); + + close(fd); + + /* read file back from offset 0 and verify size and contents */ + fd = open(path, O_RDONLY); + ok(fd != -1, "%s:%d open(%s) (fd=%d): %s", + __FILE__, __LINE__, path, fd, strerror(errno)); + + /* see to position if file if seekpos is set */ + if (seekpos > 0) { + off_t pos = lseek(fd, seekpos, SEEK_SET); + ok(pos == seekpos, "%s:%d lseek(%d) (rc=%d): %s", + __FILE__, __LINE__, pos, rc, strerror(errno)); + } + + off_t 
numread = 0; + while (1) { + /* compute number of bytes we expect to read on next attempt */ + ssize_t expected = bufsize; + ssize_t remaining = (ssize_t)(truncsize - numread - seekpos); + if (expected > remaining) { + expected = remaining; + } + + /* ask for 1MB, should only get 0.5MB back */ + rc = read(fd, buf, bufsize); + ok(rc == expected, "%s:%d read(%d) (rc=%d) expected=%d %s", + __FILE__, __LINE__, bufsize, rc, expected, strerror(errno)); + + /* check that contents we read are correct */ + if (rc > 0) { + size_t start = numread + seekpos; + int check = check_zeros(buf, rc); + ok(check == 0, "%s:%d pattern check of bytes [%d, %d) rc=%d", + __FILE__, __LINE__, (int)start, (int)(start + rc), check); + + /* add to number of bytes read so far */ + numread += rc; + } + + /* break if we hit end of file */ + if (rc == 0) { + /* check that total read is expected size */ + ok(numread == (truncsize - seekpos), + "%s:%d read %d bytes, expected %d", + __FILE__, __LINE__, (int)numread, (int)(truncsize - seekpos)); + break; + } + + /* check that we don't run past expected + * end of file (and hang the test) */ + if (numread > (truncsize - seekpos)) { + ok(numread <= (truncsize - seekpos), + "%s:%d read %d bytes, expected %d", + __FILE__, __LINE__, (int)numread, (int)(truncsize - seekpos)); + break; + } + + /* break if we hit an error, would have been + * reported in read above */ + if (rc < 0) { + break; + } + } + + close(fd); + + free(buf); + + return 0; +} + +int truncate_ftrunc_before_sync(char* unifyfs_root) +{ + char path[64]; + int rc; + int fd; + size_t global; + + size_t bufsize = 1024; + char* buf = (char*) malloc(bufsize); + + testutil_rand_path(path, sizeof(path), unifyfs_root); + + /* Open a new file for writing */ + fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0222); + ok(fd != -1, "%s:%d open(%s) (fd=%d): %s", + __FILE__, __LINE__, path, fd, strerror(errno)); + + /* file should be 0 bytes at this point */ + testutil_get_size(path, &global); + ok(global == 0, "%s:%d global size is %d expected %d", + __FILE__, __LINE__, global, 0); + + /* write a small amount, intended to be small enough that + * the write itself does not cause an implicit fsync */ + + /* write data to file */ + rc = write(fd, buf, bufsize); + ok(rc == bufsize, "%s:%d write(%d) (rc=%d): %s", + __FILE__, __LINE__, bufsize, rc, strerror(errno)); + + /* then truncate the file to 0 */ + off_t truncsize = 0; + rc = ftruncate(fd, truncsize); + ok(rc == 0, "%s:%d ftruncate(%d) (rc=%d): %s", + __FILE__, __LINE__, (int)truncsize, rc, strerror(errno)); + + /* then fsync the file */ + rc = fsync(fd); + ok(rc == 0, "%s:%d fsync() (rc=%d): %s", + __FILE__, __LINE__, rc, strerror(errno)); + + /* finally, check that the file is 0 bytes, + * i.e., check that the writes happened before the truncate + * and not at the fsync */ + testutil_get_size(path, &global); + ok(global == truncsize, "%s:%d global size is %d expected %d", + __FILE__, __LINE__, global, (int)truncsize); + + close(fd); + + free(buf); + + return 0; +} + +int truncate_trunc_before_sync(char* unifyfs_root) +{ + char path[64]; + int rc; + int fd; + size_t global; + + size_t bufsize = 1024; + char* buf = (char*) malloc(bufsize); + + testutil_rand_path(path, sizeof(path), unifyfs_root); + + /* Open a new file for writing */ + fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0222); + ok(fd != -1, "%s:%d open(%s) (fd=%d): %s", + __FILE__, __LINE__, path, fd, strerror(errno)); + + /* file should be 0 bytes at this point */ + testutil_get_size(path, &global); + ok(global == 0, 
"%s:%d global size is %d expected %d", + __FILE__, __LINE__, global, 0); + + /* write a small amount, intended to be small enough that + * the write itself does not cause an implicit fsync */ + + /* write data to file */ + rc = write(fd, buf, bufsize); + ok(rc == bufsize, "%s:%d write(%d) (rc=%d): %s", + __FILE__, __LINE__, bufsize, rc, strerror(errno)); + + /* then truncate the file to 0 */ + off_t truncsize = 0; + rc = truncate(path, truncsize); + ok(rc == 0, "%s:%d truncate(%d) (rc=%d): %s", + __FILE__, __LINE__, (int)truncsize, rc, strerror(errno)); + + /* then fsync the file */ + rc = fsync(fd); + ok(rc == 0, "%s:%d fsync() (rc=%d): %s", + __FILE__, __LINE__, rc, strerror(errno)); + + /* finally, check that the file is 0 bytes, + * i.e., check that the writes happened before the truncate + * and not at the fsync */ + testutil_get_size(path, &global); + ok(global == truncsize, "%s:%d global size is %d expected %d", + __FILE__, __LINE__, global, (int)truncsize); + + close(fd); + + free(buf); + + return 0; +} diff --git a/t/sys/unlink.c b/t/sys/unlink.c new file mode 100644 index 000000000..4f68768ef --- /dev/null +++ b/t/sys/unlink.c @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2019, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2018, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + + /* + * Test unlink + */ +#include +#include +#include +#include +#include +#include +#include "t/lib/tap.h" +#include "t/lib/testutil.h" + +static int unlink_after_sync_test(char* unifyfs_root) +{ + char path[64]; + int err, fd, rc; + + testutil_rand_path(path, sizeof(path), unifyfs_root); + + errno = 0; + fd = open(path, O_WRONLY | O_CREAT, 0222); + err = errno; + ok(fd != -1 && err == 0, "%s:%d open(%s) (fd=%d): %s", + __FILE__, __LINE__, path, fd, strerror(err)); + + errno = 0; + rc = (int) write(fd, "hello world", 12); + err = errno; + ok(rc == 12 && err == 0, + "%s:%d write(): %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = fsync(fd); + err = errno; + ok(rc == 0 && err == 0, "%s:%d fsync(): %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = close(fd); + err = errno; + ok(rc == 0 && err == 0, "%s:%d close(): %s", + __FILE__, __LINE__, strerror(err)); + + struct stat sb = {0}; + errno = 0; + rc = stat(path, &sb); + err = errno; + ok(rc == 0 && err == 0, "%s:%d stat(): %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = unlink(path); + err = errno; + ok(rc == 0 && err == 0, "%s:%d unlink(): %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = stat(path, &sb); + err = errno; + ok(rc == -1 && err == ENOENT, + "%s:%d stat() after unlink fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + errno = 0; + rc = unlink(path); + err = errno; + ok(rc == -1 && err == ENOENT, + "%s:%d unlink() already unlinked file fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + return 0; +} + +static int unlink_after_sync_laminate_test(char* unifyfs_root) +{ + char path[64]; + int err, fd, rc; + + testutil_rand_path(path, sizeof(path), unifyfs_root); + + errno = 0; + fd = open(path, O_WRONLY | O_CREAT, 0222); + err = errno; + ok(fd != -1 && err == 0, "%s:%d open(%s) (fd=%d): %s", + __FILE__, __LINE__, path, fd, strerror(err)); + + errno = 0; + rc = (int) write(fd, "hello 
world", 12); + err = errno; + ok(rc == 12 && err == 0, "%s:%d write(): %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = fsync(fd); + err = errno; + ok(rc == 0 && err == 0, "%s:%d fsync(): %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = close(fd); + err = errno; + ok(rc == 0 && err == 0, "%s:%d close(): %s", + __FILE__, __LINE__, strerror(err)); + + /* Laminate */ + errno = 0; + rc = chmod(path, 0444); + err = errno; + ok(rc == 0 && err == 0, "%s:%d chmod(0444): %s", + __FILE__, __LINE__, strerror(err)); + + struct stat sb = {0}; + errno = 0; + rc = stat(path, &sb); + err = errno; + ok(rc == 0 && err == 0, "%s:%d stat(): %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = unlink(path); + err = errno; + ok(rc == 0 && err == 0, "%s:%d unlink(): %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = stat(path, &sb); + err = errno; + ok(rc == -1 && err == ENOENT, + "%s:%d stat() after unlink fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + errno = 0; + rc = unlink(path); + err = errno; + ok(rc == -1 && err == ENOENT, + "%s:%d unlink() already unlinked, laminated file fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + return 0; +} + +int unlink_test(char* unifyfs_root) +{ + diag("Finished UNIFYFS_WRAP(unlink) tests"); + + char path[64]; + char dir_path[64]; + int err, fd, rc; + + testutil_rand_path(path, sizeof(path), unifyfs_root); + testutil_rand_path(dir_path, sizeof(dir_path), unifyfs_root); + + errno = 0; + rc = unlink(path); + err = errno; + ok(rc == -1 && err == ENOENT, + "%s:%d unlink() non-existent file fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + errno = 0; + fd = creat(path, 0222); + err = errno; + ok(fd != -1 && err == 0, "%s:%d creat(%s) (fd=%d): %s", + __FILE__, __LINE__, path, fd, strerror(err)); + + errno = 0; + rc = fsync(fd); + err = errno; + ok(rc == 0 && err == 0, "%s:%d fsync(): %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = close(fd); + err = errno; + ok(rc == 0 && err == 0, "%s:%d close(): %s", + __FILE__, __LINE__, strerror(err)); + + struct stat sb = {0}; + errno = 0; + rc = stat(path, &sb); + err = errno; + ok(rc == 0 && err == 0, "%s:%d stat(): %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = unlink(path); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d unlink() empty file: %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = stat(path, &sb); + err = errno; + ok(rc == -1 && err == ENOENT, + "%s:%d stat() after unlink fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + errno = 0; + rc = unlink(path); + err = errno; + ok(rc == -1 && err == ENOENT, + "%s:%d unlink() already unlinked, empty file fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + /* Calling unlink() on a directory should fail */ + errno = 0; + rc = mkdir(dir_path, 0777); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d mkdir(%s): %s", + __FILE__, __LINE__, dir_path, strerror(err)); + + errno = 0; + rc = unlink(dir_path); + err = errno; + ok(rc == -1 && err == EISDIR, + "%s:%d unlink() a directory fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + errno = 0; + rc = rmdir(dir_path); + err = errno; + ok(rc == 0 && err == 0, "%s:%d rmdir(): %s", + __FILE__, __LINE__, strerror(err)); + + /* Tests for unlink after writing to a file */ + int ret = unlink_after_sync_test(unifyfs_root); + if (ret != 0) { + rc = ret; + } + + ret = unlink_after_sync_laminate_test(unifyfs_root); + if (ret != 0) { 
+ rc = ret; + } + + diag("Finished UNIFYFS_WRAP(unlink) tests"); + + return rc; +} diff --git a/t/sys/write-read-hole.c b/t/sys/write-read-hole.c new file mode 100644 index 000000000..9a0c06f02 --- /dev/null +++ b/t/sys/write-read-hole.c @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2019, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2018, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + + /* + * Test reading from file with holes + */ +#include +#include +#include +#include +#include +#include "t/lib/tap.h" +#include "t/lib/testutil.h" + +static int check_contents(char* buf, size_t len, char c) +{ + int valid = 1; + size_t i; + for (i = 0; i < len; i++) { + if (buf[i] != c) { + valid = 0; + } + } + return valid; +} + +int write_read_hole_test(char* unifyfs_root) +{ + char path[64]; + int rc; + int fd; + size_t global; + + size_t bufsize = 1024*1024; + char* buf = (char*) malloc(bufsize); + int i; + for (i = 0; i < bufsize; i++) { + buf[i] = 1; + } + + testutil_rand_path(path, sizeof(path), unifyfs_root); + + /* create a file that contains: + * [0, 1MB) - data = "1" + * [1MB, 2MB) - hole = "0" implied + * [2MB, 3MB) - data = "1" + * [3MB, 4MB) - hole = "0" implied */ + + /* Write to the file */ + fd = open(path, O_WRONLY | O_CREAT, 0222); + ok(fd != -1, "%s:%d open(%s) (fd=%d): %s", + __FILE__, __LINE__, path, fd, strerror(errno)); + + /* write "1" to [0MB, 1MB) */ + rc = write(fd, buf, bufsize); + ok(rc == bufsize, "%s:%d write() (rc=%d): %s", + __FILE__, __LINE__, rc, strerror(errno)); + + /* skip over [1MB, 2MB) for implied "0" */ + rc = lseek(fd, 2*bufsize, SEEK_SET); + ok(rc == 2*bufsize, "%s:%d lseek() (rc=%d): %s", + __FILE__, __LINE__, rc, strerror(errno)); + + /* write "1" to [2MB, 3MB) */ + rc = write(fd, buf, bufsize); + ok(rc == bufsize, "%s:%d write() (rc=%d): %s", + __FILE__, __LINE__, rc, strerror(errno)); + + /* Check global size on our un-laminated file */ + testutil_get_size(path, &global); + ok(global == 3*bufsize, "%s:%d global size is %d: %s", + __FILE__, __LINE__, global, strerror(errno)); + + /* flush writes */ + rc = fsync(fd); + ok(rc == 0, "%s:%d fsync() (rc=%d): %s", + __FILE__, __LINE__, rc, strerror(errno)); + + /* Check global size on our un-laminated file */ + testutil_get_size(path, &global); + ok(global == 3*bufsize, "%s:%d global size is %d: %s", + __FILE__, __LINE__, global, strerror(errno)); + + /* truncate file at 4MB, extends file so that + * [3MB, 4MB) is implied "0" */ + rc = ftruncate(fd, 4*bufsize); + ok(rc == 0, "%s:%d ftruncate() (rc=%d): %s", + __FILE__, __LINE__, rc, strerror(errno)); + + /* Laminate */ + rc = chmod(path, 0444); + ok(rc == 0, "%s:%d chmod(0444) (rc=%d): %s", + __FILE__, __LINE__, rc, strerror(errno)); + + /* Check global size on our un-laminated file */ + testutil_get_size(path, &global); + ok(global == 4*bufsize, "%s:%d global size is %d: %s", + __FILE__, __LINE__, global, strerror(errno)); + + close(fd); + + /*************** + * open file for reading + ***************/ + + fd = open(path, O_RDONLY); + ok(fd != -1, "%s:%d open(%s) (fd=%d): %s", + __FILE__, __LINE__, path, fd, strerror(errno)); + + + /* read segment [0, 1MB) -- should be all "1" + * this should be a full read, all from actual data */ + memset(buf, 2, bufsize); + ssize_t nread = pread(fd, buf, 
bufsize, 0*bufsize); + ok(nread == bufsize, + "%s:%d pread expected=%llu got=%llu: errno=%s", + __FILE__, __LINE__, + (unsigned long long) bufsize, (unsigned long long) nread, + strerror(errno)); + + /* check that full buffer is "1" */ + int valid = check_contents(buf, bufsize, 1); + ok(valid == 1, "%s:%d data check", + __FILE__, __LINE__); + + + /* read segment [1MB, 2MB) -- should be all "0" + * this should be a full read, all from a hole */ + memset(buf, 2, bufsize); + nread = pread(fd, buf, bufsize, 1*bufsize); + ok(nread == bufsize, + "%s:%d pread expected=%llu got=%llu: errno=%s", + __FILE__, __LINE__, + (unsigned long long) bufsize, (unsigned long long) nread, + strerror(errno)); + + /* check that full buffer is "0" */ + valid = check_contents(buf, bufsize, 0); + ok(valid == 1, "%s:%d data check", + __FILE__, __LINE__); + + + /* read segment [0.5MB, 1.5MB) + * should be a full read, half data, half hole */ + memset(buf, 2, bufsize); + nread = pread(fd, buf, bufsize, bufsize/2); + ok(nread == bufsize, + "%s:%d pread expected=%llu got=%llu: errno=%s", + __FILE__, __LINE__, + (unsigned long long) bufsize, (unsigned long long) nread, + strerror(errno)); + + /* check that data portion is "1" */ + valid = check_contents(buf, bufsize/2, 1); + ok(valid == 1, "%s:%d data check", + __FILE__, __LINE__); + + /* check that hole portion is "0" */ + valid = check_contents(buf + bufsize/2, bufsize/2, 0); + ok(valid == 1, "%s:%d data check", + __FILE__, __LINE__); + + + /* read segment [3.5MB, 4.5MB) + * should read only half of requested amount, + * half hole, half past end of file */ + memset(buf, 2, bufsize); + nread = pread(fd, buf, bufsize, 3*bufsize + bufsize/2); + ok(nread == bufsize/2, + "%s:%d pread expected=%llu got=%llu: errno=%s", + __FILE__, __LINE__, + (unsigned long long) bufsize/2, (unsigned long long) nread, + strerror(errno)); + + /* first half of buffer should be "0" */ + valid = check_contents(buf, bufsize/2, 0); + ok(valid == 1, "%s:%d data check", + __FILE__, __LINE__); + + /* second half of buffer should not be changed, still "2" */ + valid = check_contents(buf + bufsize/2, bufsize/2, 2); + ok(valid == 1, "%s:%d data check", + __FILE__, __LINE__); + + + close(fd); + + free(buf); + + return 0; +} diff --git a/t/sys/write-read.c b/t/sys/write-read.c new file mode 100644 index 000000000..dadb88f00 --- /dev/null +++ b/t/sys/write-read.c @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2019, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2018, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
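For quick reference, the hole-read behavior exercised in write-read-hole.c above can be condensed as follows. This is an illustrative sketch only, not additional test code; it reuses the test's fd and buf, and B abbreviates the test's 1 MiB bufsize.

    /* Condensed expectations from the hole test above (file laid out as
     * data|hole|data|hole and laminated at size 4*B). */
    pread(fd, buf, B, 0);         /* returns B:   all bytes are 1 (real data)   */
    pread(fd, buf, B, 1*B);       /* returns B:   all bytes are 0 (hole)        */
    pread(fd, buf, B, B/2);       /* returns B:   half data (1s), half hole (0s)*/
    pread(fd, buf, B, 3*B + B/2); /* returns B/2: short read that stops at EOF; */
                                  /* bytes past B/2 in buf are left untouched   */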
+ */ + + /* + * Test write/read/lseek/fsync/stat/chmod + */ +#include +#include +#include +#include +#include +#include +#include "t/lib/tap.h" +#include "t/lib/testutil.h" + +int write_read_test(char* unifyfs_root) +{ + diag("Starting UNIFYFS_WRAP(write/read) tests"); + + char path[64]; + char buf[64] = {0}; + int fd = -1; + int err, rc; + size_t global; + + testutil_rand_path(path, sizeof(path), unifyfs_root); + + /* write to bad file descriptor should fail with errno=EBADF */ + errno = 0; + rc = (int) write(fd, "hello world", 12); + err = errno; + ok(rc == -1 && err == EBADF, + "%s:%d write() to bad file descriptor fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + /* read from bad file descriptor should fail with errno=EBADF */ + errno = 0; + rc = (int) read(fd, buf, 12); + err = errno; + ok(rc == -1 && err == EBADF, + "%s:%d read() from bad file descriptor fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + /* Write "hello world" to the file */ + errno = 0; + fd = open(path, O_WRONLY | O_CREAT, 0222); + err = errno; + ok(fd != -1 && err == 0, "%s:%d open(%s) (fd=%d): %s", + __FILE__, __LINE__, path, fd, strerror(err)); + + errno = 0; + rc = (int) write(fd, "hello world", 12); + err = errno; + ok(rc == 12 && err == 0, + "%s:%d write(\"hello world\") to file: %s", + __FILE__, __LINE__, strerror(err)); + + /* Write to a different offset by overwriting "world" with "universe" */ + errno = 0; + rc = (int) lseek(fd, 6, SEEK_SET); + err = errno; + ok(rc == 6 && err == 0, + "%s:%d lseek(6) to \"world\": %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = (int) write(fd, "universe", 9); + err = errno; + ok(rc == 9 && err == 0, + "%s:%d overwrite \"world\" at offset 6 with \"universe\": %s", + __FILE__, __LINE__, strerror(err)); + + /* Check global size on our un-laminated and un-synced file */ + testutil_get_size(path, &global); + ok(global == 15, "%s:%d global size before fsync is %d: %s", + __FILE__, __LINE__, global, strerror(err)); + + errno = 0; + rc = fsync(fd); + err = errno; + ok(rc == 0 && err == 0, "%s:%d fsync() worked: %s", + __FILE__, __LINE__, strerror(err)); + + /* Check global size on our un-laminated file */ + testutil_get_size(path, &global); + ok(global == 15, "%s:%d global size after fsync is %d: %s", + __FILE__, __LINE__, global, strerror(err)); + + /* read from file open as write-only should fail with errno=EBADF */ + errno = 0; + rc = (int) lseek(fd, 0, SEEK_SET); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d lseek(0): %s", + __FILE__, __LINE__, strerror(err)); + + todo("Successfully reads and gets 0 bytes back"); + errno = 0; + rc = (int) read(fd, buf, 15); + err = errno; + ok(rc == -1 && err == EBADF, + "%s:%d read() from file open as write-only (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + end_todo; + + errno = 0; + rc = close(fd); + err = errno; + ok(rc == 0 && err == 0, "%s:%d close() worked: %s", + __FILE__, __LINE__, strerror(err)); + + /* Test O_APPEND */ + errno = 0; + fd = open(path, O_WRONLY | O_APPEND, 0222); + err = errno; + ok(fd != -1 && err == 0, "%s:%d open(%s, O_APPEND) (fd=%d): %s", + __FILE__, __LINE__, path, fd, strerror(err)); + + /* + * Seek to an offset in the file and write. Since it's O_APPEND, the + * offset we seeked to doesn't matter - all writes go to the end. 
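Because the O_APPEND behavior called out in the comment above can be surprising, here is a minimal POSIX sketch of the semantics being tested. It is illustrative only (afd and path are placeholders, not part of the test plan).

    /* Illustrative only: with O_APPEND, the offset is moved to the current
     * end of file before every write(), so an earlier lseek() has no effect
     * on where appended data lands. */
    int afd = open(path, O_WRONLY | O_APPEND);
    off_t old_size = lseek(afd, 0, SEEK_END);   /* current EOF                */
    lseek(afd, 0, SEEK_SET);                    /* ignored by the next write  */
    ssize_t n = write(afd, "xyz", 3);           /* data lands at old_size     */
    off_t new_off = lseek(afd, 0, SEEK_CUR);    /* == old_size + n            */
    close(afd);
    (void)n;
    (void)new_off;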
+ */ + errno = 0; + rc = (int) lseek(fd, 3, SEEK_SET); + err = errno; + ok(rc == 3 && err == 0, + "%s:%d lseek(3) worked: %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = (int) write(fd, "", 6); + err = errno; + ok(rc == 6 && err == 0, + "%s:%d append write(\"\"): %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = close(fd); + err = errno; + ok(rc == 0 && err == 0, "%s:%d close() worked: %s", + __FILE__, __LINE__, strerror(err)); + + /* Check global size on our un-laminated file */ + testutil_get_size(path, &global); + ok(global == 21, "%s:%d global size before laminate is %d: %s", + __FILE__, __LINE__, global, strerror(err)); + + /* Laminate */ + errno = 0; + rc = chmod(path, 0444); + err = errno; + ok(rc == 0 && err == 0, "%s:%d chmod(0444): %s", + __FILE__, __LINE__, strerror(err)); + + /* Verify we're getting the correct file size */ + testutil_get_size(path, &global); + ok(global == 21, "%s:%d global size after laminate is %d: %s", + __FILE__, __LINE__, global, strerror(err)); + + /* open laminated file for write should fail with errno=EROFS */ + errno = 0; + fd = open(path, O_WRONLY | O_CREAT, 0222); + err = errno; + ok(fd == -1 && err == EROFS, + "%s:%d open() laminated file for write fails (fd=%d, errno=%d): %s", + __FILE__, __LINE__, fd, err, strerror(err)); + + /* read() tests */ + errno = 0; + fd = open(path, O_RDONLY, 0444); + err = errno; + ok(fd != -1 && err == 0, + "%s:%d open(%s, O_RDONLY) for read (fd=%d): %s", + __FILE__, __LINE__, path, fd, strerror(err)); + + /* write to file open as read-only should fail with errno=EBADF */ + errno = 0; + rc = (int) write(fd, "hello world", 12); + err = errno; + ok(rc == -1 && err == EBADF, + "%s:%d write() to file open as read-only fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + errno = 0; + rc = (int) read(fd, buf, 21); + err = errno; + ok(rc == 21 && err == 0, + "%s:%d read() buf[]=\"%s\": %s", + __FILE__, __LINE__, buf, strerror(err)); + buf[14] = ' '; /* replace '\0' between initial write and append */ + is(buf, "hello universe ", "%s:%d read() saw \"hello universe \"", + __FILE__, __LINE__); + + /* Seek and read at a different position */ + errno = 0; + rc = (int) lseek(fd, 6, SEEK_SET); + err = errno; + ok(rc == 6 && err == 0, + "%s:%d lseek(6) worked: %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = (int) read(fd, buf, 9); + err = errno; + ok(rc == 9 && err == 0, + "%s:%d read() at offset 6 buf[]=\"%s\": %s", + __FILE__, __LINE__, buf, strerror(err)); + is(buf, "universe", "%s:%d read() saw \"universe\"", __FILE__, __LINE__); + + errno = 0; + rc = (int) lseek(fd, 0, SEEK_SET); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d lseek(0) worked: %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = (int) read(fd, buf, sizeof(buf)); + err = errno; + ok(rc == 21 && err == 0, + "%s:%d read() past end of file: %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = close(fd); + err = errno; + ok(rc == 0 && err == 0, "%s:%d close() worked: %s", + __FILE__, __LINE__, strerror(err)); + + /* write to closed file descriptor should fail with errno=EBADF */ + errno = 0; + rc = (int) write(fd, "hello world", 12); + err = errno; + ok(rc == -1 && err == EBADF, + "%s:%d write() to bad file descriptor fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + /* read from closed file descriptor should fail with errno=EBADF */ + errno = 0; + rc = (int) read(fd, buf, 12); + err = errno; + ok(rc == -1 && err == EBADF, + "%s:%d read() from bad file 
descriptor fails (errno=%d): %s", + __FILE__, __LINE__, err, strerror(err)); + + diag("Finished UNIFYFS_WRAP(write/read) tests"); + + return 0; +} + +/* Test to reproduce issue 488 */ +int write_pre_existing_file_test(char* unifyfs_root) +{ + diag("Starting write-to-pre-existing-file tests"); + + char path[64]; + char buf[300] = {0}; + int fd = -1; + int err, rc; + size_t global; + + testutil_rand_path(path, sizeof(path), unifyfs_root); + + errno = 0; + fd = open(path, O_RDWR | O_CREAT, 0222); + err = errno; + ok(fd != -1 && err == 0, "%s:%d open(%s) (fd=%d): %s", + __FILE__, __LINE__, path, fd, strerror(err)); + + /* Write 300 bytes to a file */ + errno = 0; + rc = (int) write(fd, "a", 300); + err = errno; + ok(rc == 300 && err == 0, + "%s:%d write() a 300 byte file: %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = close(fd); + err = errno; + ok(rc == 0 && err == 0, "%s:%d close() worked: %s", + __FILE__, __LINE__, strerror(err)); + + /* Check global size is 300 */ + testutil_get_size(path, &global); + ok(global == 300, "%s:%d global size of 300 byte file is %d: %s", + __FILE__, __LINE__, global, strerror(err)); + + /* Reopen the same file */ + errno = 0; + fd = open(path, O_RDWR, 0222); + err = errno; + ok(fd != -1 && err == 0, "%s:%d open(%s) (fd=%d): %s", + __FILE__, __LINE__, path, fd, strerror(err)); + + /* Overwrite the first 100 bytes of same file */ + errno = 0; + rc = (int) write(fd, buf, 100); + err = errno; + ok(rc == 100 && err == 0, + "%s:%d overwrite first 100 bytes of same file: %s", + __FILE__, __LINE__, strerror(err)); + + errno = 0; + rc = close(fd); + err = errno; + ok(rc == 0 && err == 0, "%s:%d close() worked: %s", + __FILE__, __LINE__, strerror(err)); + + /* Check global size is 300 */ + testutil_get_size(path, &global); + ok(global == 300, "%s:%d global size of 300 byte file is %d: %s", + __FILE__, __LINE__, global, strerror(err)); + + diag("Finished write-to-pre-existing-file tests"); + + return 0; +} diff --git a/util/Makefile.am b/util/Makefile.am index ca451f459..69e40c9c1 100644 --- a/util/Makefile.am +++ b/util/Makefile.am @@ -1 +1 @@ -SUBDIRS = scripts unifyfs +SUBDIRS = scripts unifyfs unifyfs-stage diff --git a/util/unifyfs-stage/Makefile.am b/util/unifyfs-stage/Makefile.am new file mode 100644 index 000000000..af437a64d --- /dev/null +++ b/util/unifyfs-stage/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = src diff --git a/util/unifyfs-stage/src/Makefile.am b/util/unifyfs-stage/src/Makefile.am new file mode 100644 index 000000000..00e671a96 --- /dev/null +++ b/util/unifyfs-stage/src/Makefile.am @@ -0,0 +1,20 @@ +libexec_PROGRAMS = unifyfs-stage + +unifyfs_stage_SOURCES = unifyfs-stage.c \ + unifyfs-stage-transfer.c + +noinst_HEADERS = unifyfs-stage.h + +unifyfs_stage_CPPFLAGS = $(AM_CPPFLAGS) $(MPI_CFLAGS) \ + $(OPENSSL_CFLAGS) \ + -I$(top_srcdir)/client/src \ + -I$(top_srcdir)/common/src + +unifyfs_stage_LDADD = $(top_builddir)/client/src/libunifyfs.la -lrt -lm + +unifyfs_stage_LDFLAGS = -static $(CP_WRAPPERS) $(AM_LDFLAGS) \ + $(MPI_CLDFLAGS) $(OPENSSL_LIBS) + +AM_CFLAGS = -Wall -Werror + +CLEANFILES = $(libexec_PROGRAMS) diff --git a/util/unifyfs-stage/src/unifyfs-stage-transfer.c b/util/unifyfs-stage/src/unifyfs-stage-transfer.c new file mode 100644 index 000000000..f70e6d3dc --- /dev/null +++ b/util/unifyfs-stage/src/unifyfs-stage-transfer.c @@ -0,0 +1,390 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. 
+ * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "unifyfs-stage.h" + +/** + * @brief Run md5 checksum on specified file, send back + * digest. + * + * @param path path to the target file + * @param digest hash of the file + * + * @return 0 on success, errno otherwise + */ +static int md5_checksum(const char* path, unsigned char* digest) +{ + int ret = 0; + size_t len = 0; + int fd = -1; + unsigned char data[UNIFYFS_STAGE_MD5_BLOCKSIZE] = { 0, }; + MD5_CTX md5; + + fd = open(path, O_RDONLY); + if (fd < 0) { + perror("open"); + return errno; + } + + ret = MD5_Init(&md5); + if (!ret) { + fprintf(stderr, "failed to create md5 context\n"); + goto out; + } + + while ((len = read(fd, (void*) data, UNIFYFS_STAGE_MD5_BLOCKSIZE)) != 0) { + ret = MD5_Update(&md5, data, len); + if (!ret) { + fprintf(stderr, "failed to update checksum\n"); + goto out; + } + } + + ret = MD5_Final(digest, &md5); + if (!ret) { + fprintf(stderr, "failed to finalize md5\n"); + } + +out: + /* MD5_xx returns 1 for success */ + ret = (ret == 1 ? 0 : EIO); + close(fd); + + return ret; +} + +/** + * @brief prints md5 checksum into string + * + * @param buf buffer to print into + * @param digest hash of the file + * + * @return buffer that has been printed to + */ +static char* checksum_str(char* buf, unsigned char* digest) +{ + int i = 0; + char* pos = buf; + + for (i = 0; i < MD5_DIGEST_LENGTH; i++) { + pos += sprintf(pos, "%02x", digest[i]); + } + + pos[0] = '\0'; + + return buf; +} + +/** + * @brief takes check sums of two files and compares + * + * @param src path to one file + * @param dst path to the other file + * + * @return 0 if files are identical, non-zero if not, or other error + */ +static int verify_checksum(const char* src, const char* dst) +{ + int ret = 0; + int i = 0; + char md5src[2 * MD5_DIGEST_LENGTH + 1] = { 0, }; + char md5dst[2 * MD5_DIGEST_LENGTH + 1] = { 0, }; + unsigned char src_digest[MD5_DIGEST_LENGTH + 1] = { 0, }; + unsigned char dst_digest[MD5_DIGEST_LENGTH + 1] = { 0, }; + + src_digest[MD5_DIGEST_LENGTH] = '\0'; + dst_digest[MD5_DIGEST_LENGTH] = '\0'; + + ret = md5_checksum(src, src_digest); + if (ret) { + fprintf(stderr, "failed to calculate checksum for %s (%s)\n", + src, strerror(ret)); + return ret; + } + + ret = md5_checksum(dst, dst_digest); + if (ret) { + fprintf(stderr, "failed to calculate checksum for %s (%s)\n", + dst, strerror(ret)); + return ret; + } + + if (verbose) { + printf("[%d] src: %s, dst: %s\n", rank, + checksum_str(md5src, src_digest), + checksum_str(md5dst, dst_digest)); + } + + for (i = 0; i < MD5_DIGEST_LENGTH; i++) { + if (src_digest[i] != dst_digest[i]) { + fprintf(stderr, "[%d] checksum verification failed: " + "(src=%s, dst=%s)\n", rank, + checksum_str(md5src, src_digest), + checksum_str(md5dst, dst_digest)); + ret = EIO; + } + } + + return ret; +} + +/* + * Parse a line from the manifest in the form of: + * + * + * + * If the paths have spaces, they must be quoted. + * + * On success, return 0 along with allocated src and dest strings. These + * must be freed when you're finished with them. On failure return non-zero, + * and set src and dest to NULL. 
+ * + * Note, leading and tailing whitespace are ok. They just get ignored. + * Lines with only whitespace are ignored. A line of all whitespace will + * return 0, with src and dest being NULL, so users should not check for + * 'if (*src == NULL)' to see if the function failed. They should be looking + * at the return code. + */ +/** + * @brief parses manifest file line, passes back src and dst strings + * + * @param line input manifest file line + * @param src return val of src filename + * @param dst return val of dst filename + * + * @return 0 if all was well, or there was nothing; non-zero on error + */ +int +unifyfs_parse_manifest_line(char* line, char** src, char** dest) +{ + char* new_src = NULL; + char* new_dest = NULL; + char* copy; + char* tmp; + unsigned long copy_len; + int i; + unsigned int tmp_count; + int in_quotes = 0; + int rc = 0; + + copy = strdup(line); + copy_len = strlen(copy) + 1;/* +1 for '\0' */ + + /* Replace quotes and separator with '\0' */ + for (i = 0; i < copy_len; i++) { + if (copy[i] == '"') { + in_quotes ^= 1;/* toggle */ + copy[i] = '\0'; + } else if (isspace(copy[i]) && !in_quotes) { + /* + * Allow any whitespace for our separator + */ + copy[i] = '\0'; + } + } + + /* + * copy[] now contains a series of strings, one after the other + * (possibly containing some NULL strings, which we ignore) + */ + tmp = copy; + while (tmp < copy + copy_len) { + tmp_count = strlen(tmp); + if (tmp_count > 0) { + /* We have a real string */ + if (!new_src) { + new_src = strdup(tmp); + } else { + if (!new_dest) { + new_dest = strdup(tmp); + } else { + /* Error: a third file name */ + rc = 1; + break; + } + } + } + tmp += tmp_count + 1; + } + + /* Some kind of error parsing a line */ + if (rc != 0 || (new_src && !new_dest)) { + fprintf(stderr, "manifest file line >>%s<< is invalid!\n", + line); + free(new_src); + free(new_dest); + new_src = NULL; + new_dest = NULL; + if (rc == 0) { + rc = 1; + } + } + + *src = new_src; + *dest = new_dest; + + free(copy); + return rc; +} + +/** + * @brief controls the action of the stage-in or stage-out. Opens up + * the manifest file, sends each line to be parsed, and fires + * each source/destination to be staged. 
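To make the calling convention of unifyfs_parse_manifest_line() above concrete, here is a minimal usage sketch. The line contents are made up; only the quoting and ownership rules described in the comment matter.

    /* Illustrative caller of unifyfs_parse_manifest_line(); not test code. */
    char line[] = "\"/pfs/input dir/a.dat\" /unifyfs/a.dat";
    char* src = NULL;
    char* dest = NULL;

    int rc = unifyfs_parse_manifest_line(line, &src, &dest);
    if (rc == 0 && src != NULL) {
        /* src  == "/pfs/input dir/a.dat" (quotes stripped, space kept)
         * dest == "/unifyfs/a.dat"
         * Both strings were allocated by the parser and must be freed. */
        free(src);
        free(dest);
    } else if (rc == 0) {
        /* blank or all-whitespace line: rc is 0 but src/dest stay NULL */
    }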
+ * + * @param ctx stage context and instructions + * + * @return 0 indicates success, non-zero is error + */ +int unifyfs_stage_transfer(unifyfs_stage_t* ctx) +{ + int ret = 0; + int count = 0; + FILE* fp = NULL; + char* src = NULL; + char* dst = NULL; + char linebuf[LINE_MAX] = { 0, }; + + if (!ctx) { + return EINVAL; + } + + fp = fopen(ctx->manifest_file, "r"); + if (!fp) { + fprintf(stderr, "failed to open file %s: %s\n", + ctx->manifest_file, strerror(errno)); + ret = errno; + goto out; + } + + while (NULL != fgets(linebuf, LINE_MAX - 1, fp)) { + if (strlen(linebuf) < 5) { + if (linebuf[0] == '\n') { + // manifest file ends in a blank line + goto out; + } else { + fprintf(stderr, "Short (bad) manifest file line: >%s<\n", + linebuf); + ret = -EINVAL; + goto out; + } + } + ret = unifyfs_parse_manifest_line(linebuf, &src, &dst); + if (ret < 0) { + fprintf(stderr, "failed to parse %s (%s)\n", + linebuf, strerror(ret)); + goto out; + } + if (ctx->mode == UNIFYFS_STAGE_SERIAL) { + if (count % total_ranks == rank) { + if (verbose) { + fprintf(stdout, "[%d] serial transfer: src=%s, dst=%s\n", + rank, src, dst); + } + + ret = unifyfs_transfer_file_serial(src, dst); + if (ret) { + goto out; + } + + if (ret < 0) { + fprintf(stderr, "stat on %s failed (err=%d, %s)\n", + dst, errno, strerror(errno)); + ret = errno; + goto out; + } + + if (ctx->checksum) { + ret = verify_checksum(src, dst); + if (ret) { + fprintf(stderr, "checksums for >%s< and >%s< differ!\n", + src, dst); + goto out; + } + } + } + } else { + if (0 == rank) { + int fd = -1; + + if (verbose) { + fprintf(stdout, "[%d] parallel transfer: src=%s, dst=%s\n", + rank, src, dst); + } + + fd = open(dst, O_WRONLY | O_CREAT | O_TRUNC, 0600); + if (fd < 0) { + fprintf(stderr, "[%d] failed to create the file %s\n", + rank, dst); + goto out; + } + close(fd); + } + + MPI_Barrier(MPI_COMM_WORLD); + + ret = unifyfs_transfer_file_parallel(src, dst); + if (ret) { + goto out; + } + + MPI_Barrier(MPI_COMM_WORLD); + + // possible lamination check or force lamination + // may need to go here + + if (ctx->checksum && 0 == rank) { + ret = verify_checksum(src, dst); + if (ret) { + goto out; + } + } + } + + count++; + } +out: + if (ret) { + fprintf(stderr, "failed to transfer file (src=%s, dst=%s): %s\n", + src, dst, strerror(ret)); + } + + if (fp) { + fclose(fp); + fp = NULL; + } + + return ret; +} + diff --git a/util/unifyfs-stage/src/unifyfs-stage.c b/util/unifyfs-stage/src/unifyfs-stage.c new file mode 100644 index 000000000..169c6a325 --- /dev/null +++ b/util/unifyfs-stage/src/unifyfs-stage.c @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ +/* unifyfs-stage: this application is supposed to excuted by the unifyfs + * command line utility for: + * - stage in: moving files in pfs to unifyfs volume before user starts + * application, + * e.g., unifyfs start --stage-in= + * - stage out: moving files in the unifyfs volume to parallel file system + * after user application completes, + * e.g., unifyfs terminate --stage-out= + * + * Currently, we request users to pass the to specify target + * files to be transferred. The should list all target files + * and their destinations, line by line. 
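As a concrete illustration of the manifest format described above (all paths here are made up), a stage-in manifest holds one source/destination pair per line, with quotes around paths that contain spaces:

    /p/lustre/user/input1.dat        /unifyfs/input1.dat
    "/p/lustre/user/my input.dat"    "/unifyfs/my input.dat"

and each per-node unifyfs-stage process ends up being invoked roughly as shown below, matching the -m and -s options parsed later in this file (the invocation is normally assembled by unifyfs-rm rather than typed by hand):

    unifyfs-stage -m /unifyfs -s /p/lustre/user/share_dir manifest.txt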
+ * + * This supports two transfer modes (although both are technically parallel): + * + * - serial: Each process will transfer a file. Data of a single file will + * reside in a single compute node. + * - parallel (-p, --parallel): Each file will be split and transferred by all + * processes. Data of a single file will be spread evenly across all + * available compute nodes. + * + * TODO: + * Maybe later on, it would be better to have a size threshold. Based on the + * threshold, we can determine whether a file needs to transferred serially (if + * smaller than threshold), or parallelly. + */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "unifyfs_const.h" +#include "unifyfs-stage.h" + +int rank; +int total_ranks; +int verbose; + +static int debug; +static int checksum; +static int mode; +static int should_we_mount_unifyfs = 1; +static char* manifest_file; +static char* mountpoint = "/unifyfs"; +static char* share_dir; + +static unifyfs_stage_t _ctx; + +/** + * @brief create a status (lock) file to notify the unifyfs executable + * when the staging is finished + * + * @param status 0 indicates success + * + * @return 0 on success, errno otherwise + */ +static int create_status_file(int status) +{ + char filename[PATH_MAX]; + FILE* fp = NULL; + const char* msg = status ? "fail" : "success"; + int return_val_from_scnprintf; + + return_val_from_scnprintf = + scnprintf(filename, PATH_MAX, + "%s/%s", share_dir, UNIFYFS_STAGE_STATUS_FILENAME); + if (return_val_from_scnprintf > (PATH_MAX - 1)) { + fprintf(stderr, "Stage status file is too long!\n"); + return -ENOMEM; + } + + fp = fopen(filename, "w"); + if (!fp) { + fprintf(stderr, "failed to create %s (%s)\n", + filename, strerror(errno)); + return errno; + } + + fprintf(fp, "%s\n", msg); + + fclose(fp); + + return 0; +} + +static struct option long_opts[] = { + { "checksum", 0, 0, 'c' }, + { "debug", 0, 0, 'd' }, + { "help", 0, 0, 'h' }, + { "mountpoint", 1, 0, 'm' }, + { "parallel", 0, 0, 'p' }, + { "share-dir", 1, 0, 's' }, + { "verbose", 0, 0, 'v' }, + { "no-mount-unifyfs", 0, 0, 'N' }, + { 0, 0, 0, 0 }, +}; + +static char* short_opts = "cdhm:ps:vN"; + +static const char* usage_str = + "\n" + "Usage: %s [OPTION]... \n" + "\n" + "Transfer files between unifyfs volume and external file system.\n" + "The should contain list of files to be transferred,\n" + "and each line should be formatted as\n" + "\n" + " /source/file/path /destination/file/path\n" + "\n" + "OR in the case of filenames with spaces or special characters:\n" + "\n" + " \"/source/file/path\" \"/destination/file/path\"\n" + "\n" + "One file per line; Specifying directories is not supported.\n" + "\n" + "Available options:\n" + "\n" + " -c, --checksum verify md5 checksum for each transfer\n" + " -h, --help print this usage\n" + " -m, --mountpoint= use as unifyfs mountpoint\n" + " (default: /unifyfs)\n" + " -p, --parallel transfer each file in parallel\n" + " (experimental)\n" + " -s, --share-dir= directory path for creating status file\n" + " -v, --verbose print noisy outputs\n" + " -N, --no-mount-unifyfs don't mount unifyfs file system (for testing)\n" + "\n" + "Without the '-p, --parallel' option, a file is transferred by a single\n" + "process. 
If the '-p, --parallel' option is specified, each file will be\n" + "divided by multiple processes and transferred in parallel.\n" + "\n"; + +static char* program; + +static void print_usage(void) +{ + if (0 == rank) { + fprintf(stdout, usage_str, program); + } +} + +static +void debug_pause(int rank, const char* fmt, ...) +{ + if (rank == 0) { + va_list args; + + va_start(args, fmt); + vfprintf(stderr, fmt, args); + va_end(args); + + fprintf(stderr, " ENTER to continue ... "); + + (void) getchar(); + } + + MPI_Barrier(MPI_COMM_WORLD); + + /* internal accept() call from mpi may set errno */ + errno = 0; +} + +static int parse_option(int argc, char** argv) +{ + int ch = 0; + int optidx = 0; + char* filepath = NULL; + + if (argc < 2) { + return EINVAL; + } + + while ((ch = getopt_long(argc, argv, + short_opts, long_opts, &optidx)) >= 0) { + switch (ch) { + case 'c': + checksum = 1; + break; + + case 'd': + debug = 1; + break; + + case 'm': + mountpoint = strdup(optarg); + break; + + case 'p': + mode = UNIFYFS_STAGE_PARALLEL; + break; + + case 's': + share_dir = strdup(optarg); + break; + + case 'v': + verbose = 1; + break; + + case 'N': + fprintf(stderr, "WARNING: not mounting unifyfs file system!\n"); + should_we_mount_unifyfs = 0; + break; + + case 'h': + default: + break; + } + } + + if (argc - optind != 1) { + return EINVAL; + } + + filepath = argv[optind]; + + manifest_file = realpath(filepath, NULL); + if (!manifest_file) { + fprintf(stderr, "problem with accessing file %s: %s\n", + filepath, strerror(errno)); + return errno; + } + + return 0; +} + +int main(int argc, char** argv) +{ + int ret = 0; + unifyfs_stage_t* ctx = &_ctx; + + program = basename(strdup(argv[0])); + + ret = parse_option(argc, argv); + if (ret) { + if (EINVAL == ret) { + print_usage(); + } + goto preMPIout; + } + + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &total_ranks); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + ctx->rank = rank; + ctx->total_ranks = total_ranks; + ctx->checksum = checksum; + ctx->mode = mode; + ctx->mountpoint = mountpoint; + ctx->manifest_file = manifest_file; + + if (verbose) { + unifyfs_stage_print(ctx); + } + + if (debug) { + debug_pause(rank, "About to mount unifyfs.. "); + } + + if (should_we_mount_unifyfs) { + ret = unifyfs_mount(mountpoint, rank, total_ranks, 0); + if (ret) { + fprintf(stderr, "failed to mount unifyfs at %s (%s)", + ctx->mountpoint, strerror(ret)); + goto out; + } + } + + MPI_Barrier(MPI_COMM_WORLD); + + ret = unifyfs_stage_transfer(ctx); + if (ret) { + fprintf(stderr, "data transfer failed (%s)\n", strerror(errno)); + } + + /* wait until all processes are done */ + MPI_Barrier(MPI_COMM_WORLD); + + if (share_dir && rank == 0) { + ret = create_status_file(ret); + if (ret) { + fprintf(stderr, "failed to create the status file (%s)\n", + strerror(errno)); + } + } + + if (should_we_mount_unifyfs) { + ret = unifyfs_unmount(); + if (ret) { + fprintf(stderr, "unmounting unifyfs failed (ret=%d)\n", ret); + } + } +out: + MPI_Finalize(); +preMPIout: + + return ret; +} + diff --git a/util/unifyfs-stage/src/unifyfs-stage.h b/util/unifyfs-stage/src/unifyfs-stage.h new file mode 100644 index 000000000..d9a12697c --- /dev/null +++ b/util/unifyfs-stage/src/unifyfs-stage.h @@ -0,0 +1,64 @@ +#ifndef __UNIFYFS_STAGE_H +#define __UNIFYFS_STAGE_H + +#include +#include +#include + +#define UNIFYFS_STAGE_MD5_BLOCKSIZE (1048576) + +/* + * serial: each file is tranferred by a process. + * parallel: a file is transferred by all processes. 
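In serial mode, manifest entries are assigned to ranks round-robin; this mirrors the count % total_ranks == rank check in unifyfs_stage_transfer(). The helper below is hypothetical and for illustration only, just to spell the assignment out.

    /* Hypothetical helper, illustration only: in UNIFYFS_STAGE_SERIAL mode,
     * manifest line i is handled by rank (i % total_ranks), so with 4 ranks a
     * 6-entry manifest maps to ranks 0,1,2,3,0,1 and each file's data stays
     * on the single node that transferred it. */
    static inline int unifyfs_stage_owns_line(int line_index, int rank,
                                              int total_ranks)
    {
        return (line_index % total_ranks) == rank;
    }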
+ */ +enum { + UNIFYFS_STAGE_SERIAL = 0, + UNIFYFS_STAGE_PARALLEL = 1, +}; + +struct _unifyfs_stage { + int rank; /* my rank */ + int total_ranks; /* mpi world size */ + + int checksum; /* perform checksum? 0:no, 1:yes */ + int mode; /* transfer mode? 0:serial, 1:parallel */ + int should_we_mount_unifyfs; /* mount? 0:no (for testing), 1: yes */ + char* mountpoint; /* unifyfs mountpoint */ + char* manifest_file; /* manifest file containing the transfer list */ +}; + +typedef struct _unifyfs_stage unifyfs_stage_t; + +static inline void unifyfs_stage_print(unifyfs_stage_t* ctx) +{ + printf("== unifyfs stage context ==\n" + "rank = %d\n" + "total ranks = %d\n" + "checksum = %d\n" + "mode = %d\n" + "should_we_mount_unifyfs = %d\n" + "mountpoint = %s\n" + "manifest file = %s\n", + ctx->rank, + ctx->total_ranks, + ctx->checksum, + ctx->mode, + ctx->should_we_mount_unifyfs, + ctx->mountpoint, + ctx->manifest_file); +} + +/** + * @brief transfer files specified in @ctx + * + * @param ctx unifyfs_stage_t data transfer context + * + * @return 0 on success, errno otherwise + */ +int unifyfs_stage_transfer(unifyfs_stage_t* ctx); + +extern int verbose; +extern int rank; +extern int total_ranks; + +#endif /* __UNIFYFS_STAGE_H */ diff --git a/util/unifyfs/src/Makefile.am b/util/unifyfs/src/Makefile.am index 9ae1304f1..c82fc8b8f 100644 --- a/util/unifyfs/src/Makefile.am +++ b/util/unifyfs/src/Makefile.am @@ -9,9 +9,10 @@ unifyfs_LDADD = $(top_builddir)/common/src/libunifyfs_common.la AM_CPPFLAGS = -I$(top_srcdir)/common/src \ -DBINDIR=\"$(bindir)\" \ - -DSBINDIR=\"$(sbindir)\" + -DSBINDIR=\"$(sbindir)\" \ + -DLIBEXECDIR=\"$(libexecdir)\" -AM_CFLAGS = -Wall +AM_CFLAGS = -Wall -Werror CLEANFILES = $(bin_PROGRAMS) diff --git a/util/unifyfs/src/unifyfs-rm.c b/util/unifyfs/src/unifyfs-rm.c index 46f030a3a..f69b9bd2c 100644 --- a/util/unifyfs/src/unifyfs-rm.c +++ b/util/unifyfs/src/unifyfs-rm.c @@ -44,7 +44,9 @@ #include #include #include +#include #include +#include #include "unifyfs.h" @@ -56,12 +58,16 @@ typedef int (*unifyfs_rm_launch_t)(unifyfs_resource_t* resource, typedef int (*unifyfs_rm_terminate_t)(unifyfs_resource_t* resource, unifyfs_args_t* args); +typedef int (*unifyfs_rm_stage_t)(unifyfs_resource_t* resource, + unifyfs_args_t* args); + struct _ucr_resource_manager { const char* type; unifyfs_rm_read_resource_t read_resource; unifyfs_rm_launch_t launch; unifyfs_rm_terminate_t terminate; + unifyfs_rm_stage_t stage; }; typedef struct _ucr_resource_manager _ucr_resource_manager_t; @@ -101,7 +107,7 @@ static int parse_hostfile(unifyfs_resource_t* resource, int i = 0; FILE* fp = NULL; char** nodes = NULL; - char buf[1024] = { 0, }; + char buf[1024]; if (hostfile == NULL) { return -EINVAL; @@ -184,6 +190,241 @@ static int write_hostfile(unifyfs_resource_t* resource, return ret; } +/** + * @brief wait until servers become ready for client connections + * + * @param resource The job resource record + * @param args The command-line options + * + * @return 0 on success, negative errno otherwise + */ +static int wait_server_initialization(unifyfs_resource_t* resource, + unifyfs_args_t* args) +{ + int ret = UNIFYFS_SUCCESS; + int count = 0; + unsigned int interval = 3; + unsigned int wait_time = 0; + FILE* fp = NULL; + char linebuf[32]; + char filename[PATH_MAX]; + int return_val_from_scnprintf; + + return_val_from_scnprintf = + scnprintf(filename, PATH_MAX, + "%s/%s", args->share_dir, UNIFYFSD_PID_FILENAME); + if (return_val_from_scnprintf > (PATH_MAX - 2)) { + fprintf(stderr, "Unifyfs status filename is too 
long!\n"); + return -ENOMEM; + } + + while (1) { + int err; + errno = 0; + fp = fopen(filename, "r"); + err = errno; + if (fp) { + while (fgets(linebuf, 31, fp) != NULL) { + count++; + } + + if (count != resource->n_nodes) { + fprintf(stderr, + "incorrect server initialization: " + "expected %lu processes but only %u processes found\n", + resource->n_nodes, count); + ret = UNIFYFS_FAILURE; + } + + fclose(fp); + break; + } else if (err != ENOENT) { + fprintf(stderr, "failed to open file %s (%s)\n", + filename, strerror(err)); + ret = -err; + break; + } + + wait_time += interval; + sleep(interval); + + if (wait_time > args->timeout) { + ret = UNIFYFS_FAILURE; + break; + } + } + + return ret; +} + +enum { + UNIFYFS_STAGE_IN = 0, + UNIFYFS_STAGE_OUT = 1, +}; + +static inline unsigned int estimate_timeout(const char* manifest_file) +{ + /* crude guess: 20 minutes */ + return 20 * 60; +} + + +/** + * @brief wait until data stage operation finishes + * + * @param resource + * @param args + * + * @return + */ +static +int wait_stage(unifyfs_resource_t* resource, unifyfs_args_t* args) +{ + int ret = UNIFYFS_SUCCESS; + unsigned int interval = 5; + unsigned int wait_time = 0; + unsigned int timeout = 0; + FILE* fp = NULL; + const char* manifest_file = NULL; + char filename[PATH_MAX]; + char linebuf[16]; + int return_val_from_scnprintf; + + return_val_from_scnprintf = + scnprintf(filename, PATH_MAX, + "%s/%s", args->share_dir, UNIFYFS_STAGE_STATUS_FILENAME); + if (return_val_from_scnprintf > (PATH_MAX - 2)) { + fprintf(stderr, "Unifyfs status filename is too long!\n"); + return -ENOMEM; + } + + if (args->stage_timeout > 0) { + timeout = args->stage_timeout; + } else { + timeout = estimate_timeout(manifest_file); + } + + while (1) { + fp = fopen(filename, "r"); + if (fp) { + char* line = fgets(linebuf, 15, fp); + if (0 == strncmp("success", line, strlen("success"))) { + fclose(fp); + fp = NULL; + ret = 0; + break; // transfer completed + } else if (0 == strncmp("fail", line, strlen("fail"))) { + fclose(fp); + fp = NULL; + ret = -EIO; + break; // transfer failed + } else { + fclose(fp); // try again + } + } + + + if (errno != ENOENT) { + fprintf(stderr, "failed to open file %s (%s)\n", + UNIFYFS_STAGE_STATUS_FILENAME, strerror(errno)); + ret = -errno; + break; + } + + wait_time += interval; + sleep(interval); + + if (wait_time > timeout) { + ret = UNIFYFS_FAILURE; + break; + } + } + + return ret; +} + +/** + * @brief remove server pid file if exists (possibly from previous run). + * returns 0 (success) if the pid file does not exist. + * + * @return 0 on success, negative errno otherwise + */ +static int remove_server_pid_file(unifyfs_args_t* args) +{ + int ret = 0; + char filename[PATH_MAX]; + int return_val_from_scnprintf; + + return_val_from_scnprintf = + scnprintf(filename, PATH_MAX, + "%s/%s", args->share_dir, UNIFYFSD_PID_FILENAME); + if (return_val_from_scnprintf > (PATH_MAX - 2)) { + fprintf(stderr, "Unifyfs status filename is too long!\n"); + return -ENOMEM; + } + + ret = unlink(filename); + if (ret) { + if (ENOENT == errno) { + ret = 0; + } else { + fprintf(stderr, "failed to unlink existing pid file %s (%s)\n", + filename, strerror(errno)); + ret = -errno; + } + } + + return ret; +} + +/** + * @brief remove stagein/out status file if exists (possibly from previous run). + * returns 0 (success) if the pid file does not exist. 
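The only coordination between the launcher and the stage tool is the status file: rank 0 of unifyfs-stage writes a single line, "success" or "fail", into the shared directory (create_status_file() in unifyfs-stage.c), and wait_stage() above polls for it. Below is a condensed, illustrative version of the reader side with error handling trimmed; status_path, interval, and timeout stand in for the corresponding locals in wait_stage().

    /* Condensed sketch of the polling loop inside wait_stage(); not the
     * actual implementation. */
    for (unsigned int waited = 0; waited <= timeout; waited += interval) {
        FILE* f = fopen(status_path, "r");
        if (f != NULL) {
            char status[16] = { 0 };
            char* line = fgets(status, sizeof(status), f);
            fclose(f);
            if (line != NULL) {
                if (0 == strncmp(line, "success", strlen("success"))) {
                    return 0;       /* stage completed                      */
                }
                if (0 == strncmp(line, "fail", strlen("fail"))) {
                    return -EIO;    /* stage reported failure               */
                }
            }
            /* file exists but is empty or partial: fall through and retry  */
        }
        sleep(interval);
    }
    return UNIFYFS_FAILURE;         /* stage did not finish before timeout  */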
+ * + * @return 0 on success, negative errno otherwise + */ +static int remove_stage_status_file(unifyfs_args_t* args) +{ + int ret = 0; + char filename[PATH_MAX]; + int return_val_from_scnprintf; + + return_val_from_scnprintf = + scnprintf(filename, PATH_MAX, + "%s/%s", args->share_dir, UNIFYFS_STAGE_STATUS_FILENAME); + if (return_val_from_scnprintf > (PATH_MAX - 2)) { + fprintf(stderr, "Unifyfs stage status filename is too long!\n"); + return -ENOMEM; + } + + ret = unlink(filename); + if (ret) { + if (ENOENT == errno) { + ret = 0; + } else { + fprintf(stderr, "failed to unlink existing stage status file " + "%s (%s)\n", filename, strerror(errno)); + ret = -errno; + } + } + + return ret; +} + +static inline char* str_rtrim(char* str) +{ + if (str) { + char* pos = &str[strlen(str) - 1]; + + while (pos >= str && isspace(*pos)) { + *pos = '\0'; + pos--; + } + } + + return str; +} + /** * @brief Get node list from $LSB_HOSTS or $LSB_MCPU_HOSTS. * @@ -196,7 +437,7 @@ static int lsf_read_resource(unifyfs_resource_t* resource) size_t i, n_nodes; char* val; char* node; - char* last_node; + char* last_node = NULL; char* lsb_hosts; char* pos; char** nodes; @@ -216,7 +457,15 @@ static int lsf_read_resource(unifyfs_resource_t* resource) } } - lsb_hosts = strdup(val); + // LSB_MCPU_HOSTS string includes a space at the end, which causes extra + // node count (n_nodes). + lsb_hosts = str_rtrim(strdup(val)); + + // get length of host string + size_t hosts_len = strlen(lsb_hosts) + 1; + + // pointer to character just past terminating NULL + char* hosts_end = lsb_hosts + hosts_len; // replace spaces with zeroes for (pos = lsb_hosts; *pos; pos++) { @@ -232,7 +481,7 @@ static int lsf_read_resource(unifyfs_resource_t* resource) } else { pos += (strlen(pos) + 1); // skip launch node slot count } - for (n_nodes = 0; *pos;) { + for (n_nodes = 0; pos < hosts_end;) { node = pos; if (!mcpu) { if (strcmp(last_node, node) != 0) { @@ -259,7 +508,7 @@ static int lsf_read_resource(unifyfs_resource_t* resource) } else { pos += (strlen(pos) + 1); // skip launch node slot count } - for (i = 0; *pos && i < n_nodes;) { + for (i = 0; pos < hosts_end && i < n_nodes;) { node = pos; if (!mcpu) { if (strcmp(last_node, node) != 0) { @@ -363,8 +612,16 @@ static int slurm_read_resource(unifyfs_resource_t* resource) return ret; } +// construct_server_argv(): +// This function is called in two ways. +// Call it once with server_argv==NULL and it +// will count up the number of arguments you'll have, but +// doesn't construct the list itself. Call it again with +// the same args but with a buffer in server_argv, and it will +// construct the argument list there. /** - * @brief Default server launch routine + * @brief Constructs argument chain to mpi-start (or terminate) + * unifyfs server processes. * * @param args The command-line options * @param server_args Server argument vector to be filled @@ -430,6 +687,51 @@ static size_t construct_server_argv(unifyfs_args_t* args, return argc; } +// construct_stage_argv: +// this is currently set up to create one rank per compute node, +// mirroring the configuration of the servers. However, in the +// future, this may be reconfigured to have more, to support +// more files being staged in or out more quickly. +/** + * @brief Constructs argument chain to mpi-start (or terminate) + * unifyfs-stage stagein/out process. 
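construct_stage_argv() follows the same count-then-fill convention described for construct_server_argv() above. Below is a minimal caller sketch; launcher_argc is a placeholder for the launcher-specific argument count, and the extra calloc() slot provides the NULL terminator that execvp() requires.

    /* Illustrative caller only; the real callers are jsrun_stage(),
     * mpirun_stage(), and srun_stage() below. */
    size_t stage_argc = construct_stage_argv(args, NULL);    /* pass 1: count */
    char** argv = calloc(launcher_argc + stage_argc + 1, sizeof(char*));

    /* ... fill argv[0 .. launcher_argc-1] with the launcher command ... */

    construct_stage_argv(args, argv + launcher_argc);         /* pass 2: fill */
    execvp(argv[0], argv);            /* trailing NULL comes from calloc()    */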
+ * + * @param args The command-line options + * @param stage_args unifyfs-stage argument vector to be filled + * + * @return number of server arguments + */ +static size_t construct_stage_argv(unifyfs_args_t* args, + char** stage_argv) +{ + size_t argc = 0; + + if (stage_argv != NULL) { + stage_argv[0] = strdup(LIBEXECDIR "/unifyfs-stage"); + } + argc = 1; + + if (args->mountpoint != NULL) { + if (stage_argv != NULL) { + stage_argv[argc] = strdup("-m"); + stage_argv[argc + 1] = strdup(args->mountpoint); + } + argc += 2; + } + + if (stage_argv != NULL) { + char* manifest_file = args->stage_in ? args->stage_in + : args->stage_out; + + stage_argv[argc] = strdup("-s"); + stage_argv[argc + 1] = strdup(args->share_dir); + stage_argv[argc + 2] = strdup(manifest_file); + } + argc += 3; + + return argc; +} + /** * @brief Default server launch routine * @@ -458,6 +760,20 @@ static int invalid_terminate(unifyfs_resource_t* resource, return -ENOSYS; } +/** + * @brief Default data stage routine + * + * @param resource Not used + * @param args Not used + * + * @return -ENOSYS + */ +static int invalid_stage(unifyfs_resource_t* resource, + unifyfs_args_t* args) +{ + return -ENOSYS; +} + /** * @brief Launch servers using IBM jsrun * @@ -474,7 +790,7 @@ static int jsrun_launch(unifyfs_resource_t* resource, char n_nodes[16]; // full command: jsrun - jsrun_argc = 9; + jsrun_argc = 13; snprintf(n_nodes, sizeof(n_nodes), "%zu", resource->n_nodes); server_argc = construct_server_argv(args, NULL); @@ -486,11 +802,15 @@ static int jsrun_launch(unifyfs_resource_t* resource, argv[1] = strdup("--immediate"); argv[2] = strdup("-e"); argv[3] = strdup("individual"); - argv[4] = strdup("--nrs"); - argv[5] = strdup(n_nodes); - argv[6] = strdup("-r1"); - argv[7] = strdup("-c1"); - argv[8] = strdup("-a1"); + argv[4] = strdup("--stdio_stderr"); + argv[5] = strdup("unifyfsd.err.%h.%p"); + argv[6] = strdup("--stdio_stdout"); + argv[7] = strdup("unifyfsd.out.%h.%p"); + argv[8] = strdup("--nrs"); + argv[9] = strdup(n_nodes); + argv[10] = strdup("-r1"); + argv[11] = strdup("-c1"); + argv[12] = strdup("-a1"); construct_server_argv(args, argv + jsrun_argc); execvp(argv[0], argv); @@ -538,6 +858,50 @@ static int jsrun_terminate(unifyfs_resource_t* resource, return -errno; } +/** + * @brief Launch data stage using IBM jsrun + * + * @param resource The job resource record + * @param args The command-line options + * + * @return + */ +static int jsrun_stage(unifyfs_resource_t* resource, + unifyfs_args_t* args) +{ + size_t argc, jsrun_argc, stage_argc; + char** argv = NULL; + char n_nodes[16]; + + // full command: jsrun + jsrun_argc = 13; + snprintf(n_nodes, sizeof(n_nodes), "%zu", resource->n_nodes); + + stage_argc = construct_stage_argv(args, NULL); + + // setup full command argv + argc = 1 + jsrun_argc + stage_argc; + argv = calloc(argc, sizeof(char*)); + argv[0] = strdup("jsrun"); + argv[1] = strdup("--immediate"); + argv[2] = strdup("-e"); + argv[3] = strdup("individual"); + argv[4] = strdup("--stdio_stderr"); + argv[5] = strdup("unifyfs-stage.err.%h.%p"); + argv[6] = strdup("--stdio_stdout"); + argv[7] = strdup("unifyfs-stage.out.%h.%p"); + argv[8] = strdup("--nrs"); + argv[9] = strdup(n_nodes); + argv[10] = strdup("-r1"); + argv[11] = strdup("-c1"); + argv[12] = strdup("-a1"); + construct_stage_argv(args, argv + jsrun_argc); + + execvp(argv[0], argv); + perror("failed to execvp() mpirun to handle data stage"); + return -errno; +} + /** * @brief Launch servers using mpirun (OpenMPI) * @@ -611,6 +975,42 @@ static int 
     return -errno;
 }
 
+/**
+ * @brief Launch unifyfs-stage using mpirun (OpenMPI)
+ *
+ * @param resource The job resource record
+ * @param args The command-line options
+ *
+ * @return -errno on failure (does not return if execvp() succeeds)
+ */
+static int mpirun_stage(unifyfs_resource_t* resource,
+                        unifyfs_args_t* args)
+{
+    size_t argc, mpirun_argc, stage_argc;
+    char** argv = NULL;
+    char n_nodes[16];
+
+    // full command: mpirun
+
+    mpirun_argc = 5;
+    snprintf(n_nodes, sizeof(n_nodes), "%zu", resource->n_nodes);
+
+    stage_argc = construct_stage_argv(args, NULL);
+
+    // setup full command argv
+    argc = 1 + mpirun_argc + stage_argc;
+    argv = calloc(argc, sizeof(char*));
+    argv[0] = strdup("mpirun");
+    argv[1] = strdup("-np");
+    argv[2] = strdup(n_nodes);
+    argv[3] = strdup("--map-by");
+    argv[4] = strdup("ppr:1:node");
+    construct_stage_argv(args, argv + mpirun_argc);
+
+    execvp(argv[0], argv);
+    perror("failed to execvp() mpirun to handle data stage");
+    return -errno;
+}
 /**
  * @brief Launch servers using SLURM srun
  *
@@ -685,6 +1085,43 @@ static int srun_terminate(unifyfs_resource_t* resource,
     return -errno;
 }
 
+/**
+ * @brief Launch unifyfs-stage using SLURM srun
+ *
+ * @param resource The job resource record
+ * @param args The command-line options
+ *
+ * @return -errno on failure (does not return if execvp() succeeds)
+ */
+static int srun_stage(unifyfs_resource_t* resource,
+                      unifyfs_args_t* args)
+{
+    size_t argc, srun_argc, stage_argc;
+    char** argv = NULL;
+    char n_nodes[16];
+
+    // full command: srun
+
+    srun_argc = 5;
+    snprintf(n_nodes, sizeof(n_nodes), "%zu", resource->n_nodes);
+
+    stage_argc = construct_stage_argv(args, NULL);
+
+    // setup full command argv
+    argc = 1 + srun_argc + stage_argc;
+    argv = calloc(argc, sizeof(char*));
+    argv[0] = strdup("srun");
+    argv[1] = strdup("-N");
+    argv[2] = strdup(n_nodes);
+    argv[3] = strdup("--ntasks-per-node");
+    argv[4] = strdup("1");
+    construct_stage_argv(args, argv + srun_argc);
+
+    execvp(argv[0], argv);
+    perror("failed to execvp() srun to handle data stage");
+    return -errno;
+}
+
 /**
  * @brief Launch servers using custom script
  *
@@ -756,11 +1193,41 @@ static int script_terminate(unifyfs_resource_t* resource,
  * match the definition in common/src/rm_enumerator.h
  */
 static _ucr_resource_manager_t resource_managers[] = {
-    { "none", &invalid_read_resource, &invalid_launch, &invalid_terminate },
-    { "pbs", &pbs_read_resource, &mpirun_launch, &mpirun_terminate },
-    { "slurm", &slurm_read_resource, &srun_launch, &srun_terminate },
-    { "lsf", &lsf_read_resource, &mpirun_launch, &mpirun_terminate },
-    { "lsfcsm", &lsf_read_resource, &jsrun_launch, &jsrun_terminate },
+    {
+        .type = "none",
+        .read_resource = &invalid_read_resource,
+        .launch = &invalid_launch,
+        .terminate = &invalid_terminate,
+        .stage = &invalid_stage,
+    },
+    {
+        .type = "pbs",
+        .read_resource = &pbs_read_resource,
+        .launch = &mpirun_launch,
+        .terminate = &mpirun_terminate,
+        .stage = &mpirun_stage,
+    },
+    {
+        .type = "slurm",
+        .read_resource = &slurm_read_resource,
+        .launch = &srun_launch,
+        .terminate = &srun_terminate,
+        .stage = &srun_stage,
+    },
+    {
+        .type = "lsf",
+        .read_resource = &lsf_read_resource,
+        .launch = &mpirun_launch,
+        .terminate = &mpirun_terminate,
+        .stage = &mpirun_stage,
+    },
+    {
+        .type = "lsfcsm",
+        .read_resource = &lsf_read_resource,
+        .launch = &jsrun_launch,
+        .terminate = &jsrun_terminate,
+        .stage = &jsrun_stage,
+    },
 };
 
 int unifyfs_detect_resources(unifyfs_resource_t* resource)
@@ -786,6 +1253,7 @@ int unifyfs_start_servers(unifyfs_resource_t* resource,
                           unifyfs_args_t* args)
 {
     int rc;
+    pid_t pid;
 
     if ((resource == NULL) || (args == NULL)) {
         return -EINVAL;
@@ -793,25 +1261,93 @@ int unifyfs_start_servers(unifyfs_resource_t* resource,
     rc = write_hostfile(resource, args);
     if (rc) {
-        fprintf(stderr, "ERROR: failed to write shared server hostfile\n");
+        fprintf(stderr, "Failed to write shared server hostfile!\n");
         return rc;
     }
 
-    if (args->script != NULL) {
-        return script_launch(resource, args);
-    } else {
-        return resource_managers[resource->rm].launch(resource, args);
+    rc = remove_server_pid_file(args);
+    if (rc) {
+        fprintf(stderr, "Failed to remove server pid file!\n");
+        return rc;
+    }
+
+    pid = fork();
+    if (pid < 0) {
+        fprintf(stderr, "Failed to create server launch process (%s)\n",
+                strerror(errno));
+        return -errno;
+    } else if (pid == 0) {
+        if (args->script != NULL) {
+            return script_launch(resource, args);
+        } else {
+            return resource_managers[resource->rm].launch(resource, args);
+        }
     }
+
+    rc = wait_server_initialization(resource, args);
+    if (rc) {
+        fprintf(stderr, "Failed to wait for server initialization\n");
+    }
+
+    if (args->stage_in) {
+        rc = remove_stage_status_file(args);
+        if (rc) {
+            fprintf(stderr, "Failed to remove stage status file\n");
+            return rc;
+        }
+
+        pid = fork();
+        if (pid < 0) {
+            fprintf(stderr, "failed to create stage-in launch process (%s)\n",
+                    strerror(errno));
+            return -errno;
+        } else if (pid == 0) {
+            return resource_managers[resource->rm].stage(resource, args);
+        }
+
+        rc = wait_stage(resource, args);
+        if (rc) {
+            fprintf(stderr, "failed to detect the stage-in status (rc=%d)\n",
+                    rc);
+        }
+    }
+
+    return rc;
 }
 
 int unifyfs_stop_servers(unifyfs_resource_t* resource,
                          unifyfs_args_t* args)
 {
+    int rc;
+    pid_t pid;
 
     if ((resource == NULL) || (args == NULL)) {
         return -EINVAL;
     }
 
+    if (args->stage_out) {
+        rc = remove_stage_status_file(args);
+        if (rc) {
+            fprintf(stderr, "Failed to remove stage status file\n");
+            return rc;
+        }
+
+        pid = fork();
+        if (pid < 0) {
+            fprintf(stderr, "failed to create stage-out launch process (%s)\n",
+                    strerror(errno));
+            return -errno;
+        } else if (pid == 0) {
+            return resource_managers[resource->rm].stage(resource, args);
+        }
+
+        rc = wait_stage(resource, args);
+        if (rc) {
+            fprintf(stderr, "failed to detect the stage-out status (rc=%d)\n",
+                    rc);
+        }
+    }
+
     if (args->script != NULL) {
         return script_terminate(resource, args);
     } else {
diff --git a/util/unifyfs/src/unifyfs.c b/util/unifyfs/src/unifyfs.c
index f77801157..23438b71d 100644
--- a/util/unifyfs/src/unifyfs.c
+++ b/util/unifyfs/src/unifyfs.c
@@ -44,6 +44,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "unifyfs.h"
 
@@ -75,11 +76,13 @@ static struct option const long_opts[] = {
     { "share-dir", required_argument, NULL, 'S' },
     { "stage-in", required_argument, NULL, 'i' },
    { "stage-out", required_argument, NULL, 'o' },
+    { "timeout", required_argument, NULL, 't' },
+    { "stage-timeout", required_argument, NULL, 'T' },
     { 0, 0, 0, 0 },
 };
 
 static char* program;
-static char* short_opts = ":cC:de:hi:m:o:s:S:";
+static char* short_opts = ":cC:de:hi:m:o:s:S:t:T:";
 static char* usage_str =
     "\n"
     "Usage: %s [options...]\n"
@@ -93,17 +96,21 @@ static char* usage_str =
     " -h, --help print usage\n"
     "\n"
     "Command options for \"start\":\n"
-    " -C, --consistency= [OPTIONAL] consistency model (NONE | LAMINATED | POSIX)\n"
-    " -e, --exe= [OPTIONAL] where unifyfsd is installed\n"
-    " -m, --mount= [OPTIONAL] mount UnifyFS at \n"
-    " -s, --script= [OPTIONAL] to custom launch script\n"
-    " -S, --share-dir= [REQUIRED] shared file system for use by servers\n"
-    " -c, --cleanup [OPTIONAL] clean up the UnifyFS storage upon server exit\n"
-    " -i, --stage-in= [OPTIONAL, NOT YET SUPPORTED] stage in file(s) at \n"
-    " -o, --stage-out= [OPTIONAL, NOT YET SUPPORTED] stage out file(s) to on termination\n"
+    " -C, --consistency= [OPTIONAL] consistency model (NONE | LAMINATED | POSIX)\n"
+    " -e, --exe= [OPTIONAL] where unifyfsd is installed\n"
+    " -m, --mount= [OPTIONAL] mount UnifyFS at \n"
+    " -s, --script= [OPTIONAL] to custom launch script\n"
+    " -t, --timeout= [OPTIONAL] wait until all servers become ready\n"
+    " -S, --share-dir= [REQUIRED] shared file system for use by servers\n"
+    " -c, --cleanup [OPTIONAL] clean up the UnifyFS storage upon server exit\n"
+    " -i, --stage-in= [OPTIONAL] stage in file(s) listed in file\n"
+    " -T, --stage-timeout= [OPTIONAL] timeout for stage-in operation\n"
     "\n"
     "Command options for \"terminate\":\n"
-    " -s, --script= to custom termination script\n"
+    " -o, --stage-out= [OPTIONAL] stage out file(s) listed in on termination\n"
+    " -T, --stage-timeout= [OPTIONAL] timeout for stage-out operation\n"
+    " -s, --script= [OPTIONAL] to custom termination script\n"
+    " -S, --share-dir= [REQUIRED for --stage-out] shared file system for use by servers\n"
     "\n";
 
 static int debug;
@@ -119,6 +126,8 @@ static void parse_cmd_arguments(int argc, char** argv)
     int ch = 0;
     int optidx = 2;
     int cleanup = 0;
+    int timeout = UNIFYFS_DEFAULT_INIT_TIMEOUT;
+    int stage_timeout = -1;
     unifyfs_cm_e consistency = UNIFYFS_CM_LAMINATED;
     char* mountpoint = NULL;
     char* script = NULL;
@@ -127,6 +136,8 @@ static void parse_cmd_arguments(int argc, char** argv)
     char* stage_in = NULL;
     char* stage_out = NULL;
 
+    int argument_count = 1;
+
     while ((ch = getopt_long(argc, argv, short_opts, long_opts,
                              &optidx)) >= 0) {
         switch (ch) {
@@ -162,21 +173,32 @@ static void parse_cmd_arguments(int argc, char** argv)
             share_dir = strdup(optarg);
             break;
 
+        case 't':
+            timeout = atoi(optarg);
+            break;
+
+        case 'T':
+            stage_timeout = atoi(optarg);
+            break;
+
         case 'i':
-            printf("WARNING: stage-in not yet supported!\n");
             stage_in = strdup(optarg);
             break;
 
         case 'o':
-            printf("WARNING: stage-out not yet supported!\n");
             stage_out = strdup(optarg);
            break;
 
         case 'h':
-        default:
             usage(0);
             break;
+
+        default:
+            printf("\n\nArgument %d is invalid!\n", argument_count);
+            usage(-EINVAL);
+            break;
         }
+        argument_count++;
     }
 
     cli_args.debug = debug;
@@ -188,6 +210,8 @@ static void parse_cmd_arguments(int argc, char** argv)
     cli_args.share_dir = share_dir;
     cli_args.stage_in = stage_in;
     cli_args.stage_out = stage_out;
+    cli_args.stage_timeout = stage_timeout;
+    cli_args.timeout = timeout;
 }
 
 int main(int argc, char** argv)
@@ -229,6 +253,7 @@ int main(int argc, char** argv)
         printf("server:\t%s\n", cli_args.server_path);
         printf("stage_in:\t%s\n", cli_args.stage_in);
         printf("stage_out:\t%s\n", cli_args.stage_out);
+        printf("stage_timeout:\t%d\n", cli_args.stage_timeout);
     }
 
     ret = unifyfs_detect_resources(&resource);
@@ -250,9 +275,47 @@ int main(int argc, char** argv)
         if (NULL == cli_args.share_dir) {
             printf("USAGE ERROR: shared directory (-S) is required!\n");
             usage(1);
+            return -EINVAL;
+        }
+        if (cli_args.stage_in != NULL) {
+            if (cli_args.script) {
+                fprintf(stderr,
+                        "WARNING: --stage-in cannot be used with a custom script (-s).\n"
+                        "This combination is not supported.\n");
+                return -EINVAL;
+            }
+            if (access(cli_args.stage_in, R_OK)) {
+                fprintf(stderr,
+                        "Cannot read stage-in manifest file: %s\n",
+                        cli_args.stage_in);
+                return -ENOENT;
+            }
         }
         return unifyfs_start_servers(&resource, &cli_args);
     } else if (action == ACT_TERMINATE) {
+        if (cli_args.stage_out != NULL) {
+            // The share directory isn't required just to terminate the
+            // servers, but it IS required for stage-out, so that the
+            // stage-out can be started and the terminate request can
+            // wait until the stage-out is complete.
+            if (NULL == cli_args.share_dir) {
+                printf("USAGE ERROR: shared directory (-S) is required!\n");
+                usage(1);
+                return -EINVAL;
+            }
+            if (cli_args.script) {
+                fprintf(stderr,
+                        "WARNING: --stage-out cannot be used with a custom script (-s).\n"
+                        "This combination is not supported.\n");
+                return -EINVAL;
+            }
+            if (access(cli_args.stage_out, R_OK)) {
+                fprintf(stderr,
+                        "Cannot read stage-out manifest file: %s\n",
+                        cli_args.stage_out);
+                return -ENOENT;
+            }
+        }
         return unifyfs_stop_servers(&resource, &cli_args);
     } else {
         fprintf(stderr, "INTERNAL ERROR: unhandled action %d\n", (int)action);
diff --git a/util/unifyfs/src/unifyfs.h b/util/unifyfs/src/unifyfs.h
index 51917d366..a14e99592 100644
--- a/util/unifyfs/src/unifyfs.h
+++ b/util/unifyfs/src/unifyfs.h
@@ -57,6 +57,7 @@ struct _unifyfs_args {
     int debug;            /* enable debug output */
     int cleanup;          /* cleanup on termination? (0 or 1) */
+    int timeout;          /* timeout for server initialization */
     unifyfs_cm_e consistency;  /* consistency model */
     char* mountpoint;     /* mountpoint */
     char* server_path;    /* full path to installed unifyfsd */
@@ -64,6 +65,7 @@ struct _unifyfs_args {
     char* share_hostfile; /* full path to shared server hostfile */
     char* stage_in;       /* data path to stage-in */
     char* stage_out;      /* data path to stage-out (drain) */
+    int stage_timeout;    /* timeout for (in or out) file staging */
     char* script;         /* path to custom launch/terminate script */
 };
 typedef struct _unifyfs_args unifyfs_args_t;
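
Note on the launcher routines in this patch: construct_server_argv() and construct_stage_argv() follow a two-pass convention, called first with a NULL vector to learn how many entries are needed and then again with an allocated buffer to fill. The following condensed C sketch restates that pattern; it is illustrative only, with srun_argc and args as defined in the patch above and error handling omitted:

    size_t stage_argc, argc;
    char** argv;

    stage_argc = construct_stage_argv(args, NULL);   /* pass 1: count only */
    argc = 1 + srun_argc + stage_argc;               /* +1 leaves one unused slot */
    argv = calloc(argc, sizeof(char*));              /* calloc() zero-fills the vector */
    /* ... fill argv[0..srun_argc-1] with the launcher-specific arguments ... */
    construct_stage_argv(args, argv + srun_argc);    /* pass 2: fill in the stage args */
    execvp(argv[0], argv);                           /* vector is already NULL-terminated */

Because calloc() zero-fills and only argc - 1 slots are populated, the final entry remains NULL, which provides the terminator that execvp() requires.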