add append option
consolidate object helpers into a single function to allow skipping existing objects
MRossol committed May 27, 2020
1 parent 32df289 commit a84d408
Showing 2 changed files with 51 additions and 58 deletions.
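
The skip-existing behavior described in the commit message rests on h5py's visititems() traversal: a single callback receives every (name, object) pair in the source file, and a membership test against the target decides whether to create the object or leave it alone. A minimal sketch of that pattern, using two local h5py files and handling only groups and datasets (file names are hypothetical; the real code targets an HDF Server domain via h5pyd):

    import h5py

    fin = h5py.File("source.h5", "r")    # file being loaded
    fout = h5py.File("target.h5", "a")   # file being appended to

    def object_helper(name, obj):
        # append semantics: anything already present in the target is skipped
        if name in fout:
            print("{} already exists and will be skipped".format(name))
        elif isinstance(obj, h5py.Group):
            fout.create_group(name)
        elif isinstance(obj, h5py.Dataset):
            fout.create_dataset(name, data=obj[()])

    fin.visititems(object_helper)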
17 changes: 12 additions & 5 deletions h5pyd/_apps/hsload.py
@@ -98,6 +98,7 @@ def usage():
     print(" -e | --endpoint <domain> :: The HDF Server endpoint, e.g. http://hsdshdflab.hdfgroup.org")
     print(" -u | --user <username> :: User name credential")
     print(" -p | --password <password> :: Password credential")
+    print(" -a | --append :: Flag to append to an existing HDF Server domain")
     print(" -c | --conf <file.cnf> :: A credential and config file")
     print(" -z[n] :: apply compression filter to any non-compressed datasets, n: [0-9]")
     print(" --cnf-eg :: Print a config file and then exit")
@@ -135,6 +136,7 @@ def main():
     logfname=None
     ipvfam=None
     s3 = None # s3fs instance
+    mode = ''

     src_files = []
     argn = 1
@@ -147,8 +149,10 @@
             sys.stderr.write("options must precede source files")
             usage()
             sys.exit(-1)
+
         if len(sys.argv) > argn + 1:
-            val = sys.argv[argn+1]
+            val = sys.argv[argn + 1]
+
         if arg in ("-v", "--verbose"):
             verbose = True
             argn += 1
@@ -200,6 +204,9 @@ def main():
         elif arg in ("-p", "--password"):
             cfg["hs_password"] = val
             argn += 2
+        elif arg in ("-a", "--append"):
+            mode = 'a'
+            argn += 1
         elif arg == '--cnf-eg':
             print_config_example()
             sys.exit(0)
@@ -263,15 +270,15 @@ def main():

     for src_file in src_files:
         # check if this is a non local file, if it is remote (http, etc...) stage it first then insert it into hsds
-        src_file_chk = urlparse(src_file)
+        src_file_chk = urlparse(src_file)
         logging.debug(src_file_chk)

         if src_file_chk.scheme == 'http' or src_file_chk.scheme == 'https' or src_file_chk.scheme == 'ftp':
             src_file = stage_file(src_file, netfam=ipvfam)
-            if src_file == None:
+            if src_file is None:
                 continue
             istmp = True
-            logging.info('temp source data: '+str(src_file))
+            logging.info('temp source data: ' + str(src_file))
         else:
             istmp = False

@@ -315,7 +322,7 @@ def main():
         endpoint = cfg["hs_endpoint"]
         bucket = cfg["hs_bucket"]

-        fout = h5pyd.File(tgt, 'a', endpoint=endpoint, username=username, password=password, bucket=bucket)
+        fout = h5pyd.File(tgt, mode, endpoint=endpoint, username=username, password=password, bucket=bucket)
     except IOError as ioe:
         if ioe.errno == 404:
             logging.error("Domain: {} not found".format(tgt))
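
With the hsload.py changes above, the File mode is no longer hard-coded to 'a': it stays at the empty-string default unless -a/--append is given on the command line, e.g. hsload -a source.h5 /home/user/target.h5 (paths hypothetical). A sketch of the open call the flag feeds into, as shown in the last hunk above; the endpoint is the example from the usage text, and the domain path and credentials are placeholder values:

    import h5pyd

    mode = 'a'  # set when -a/--append is passed; '' otherwise
    fout = h5pyd.File("/home/user/target.h5", mode,
                      endpoint="http://hsdshdflab.hdfgroup.org",
                      username="myuser", password="mypass",
                      bucket=None)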
92 changes: 39 additions & 53 deletions h5pyd/_apps/utillib.py
@@ -532,8 +532,8 @@ def create_links(gsrc, gdes, ctx):
                 gdes[title] = des_obj
             else:
                 # TBD - in hdf5 1.10 it seems that two references to the same object
-                # can return different id's. This will cause HDF5 files with
-                # multilinks to not load correctly
+                # can return different id's. This will cause HDF5 files with
+                # multilinks to not load correctly
                 msg = "could not find map item to src id: {}".format(src_obj_id_hash)
                 logging.warn(msg)
                 if ctx["verbose"]:
@@ -622,83 +622,69 @@ def load_file(fin, fout, verbose=False, dataload="ingest", s3path=None, deflate=
     ctx["s3path"] = s3path
     ctx["srcid_desobj_map"] = {}

-
     # create any root attributes
     for ga in fin.attrs:
         copy_attribute(fout, ga, fin, ctx)

-    def object_create_helper(name, obj):
-        class_name = obj.__class__.__name__
-        if class_name in ("Dataset", "Table"):
-            create_dataset(obj, ctx)
-        elif class_name == "Group":
-            create_group(obj, ctx)
-        elif class_name == "Datatype":
-            create_datatype(obj, ctx)
-        else:
-            logging.error("no handler for object class: {}".format(type(obj)))
-
-    def object_link_helper(name, obj):
-        class_name = obj.__class__.__name__
-        logging.debug("object_link_helper for object: {}".format(obj.name))
-        if class_name == "Group":
-            # create any soft/external links
-            fout = ctx["fout"]
-            grp = fout[name]
-            create_links(obj, grp, ctx)
-
-    def object_copy_helper(name, obj):
-        class_name = obj.__class__.__name__
-        logging.debug("object_copy_helper for object: {}".format(obj.name))
-        if class_name in ("Dataset", "Table"):
-            if ctx["dataload"] == "link":
-                logging.info("skip datacopy for link reference")
-            else:
-                logging.debug("calling write_dataset for dataset: {}".format(obj.name))
-                tgt = fout[obj.name]
-                write_dataset(obj, tgt, ctx)
-        elif class_name == "Group":
-            logging.debug("skip copy for group: {}".format(obj.name))
-        elif class_name == "Datatype":
-            logging.debug("skip copy for datatype: {}".format(obj.name))
-        else:
-            logging.error("no handler for object class: {}".format(type(obj)))
-
-    def object_attribute_helper(name, obj):
-        tgt = fout[obj.name]
-        for ga in obj.attrs:
-            copy_attribute(tgt, ga, obj, ctx)
-
-    # build a rough map of the file using the internal function above
-    logging.info("creating target objects")
-    fin.visititems(object_create_helper)
-
-    # copy over any attributes
-    logging.info("creating target attributes")
-    fin.visititems(object_attribute_helper)
-
-    # create soft/external links (and hardlinks not already created)
-    create_links(fin, fout, ctx) # create root soft/external links
-    fin.visititems(object_link_helper)
-
-    if dataload == "ingest":
-        # copy dataset data
-        logging.info("copying dataset data")
-        fin.visititems(object_copy_helper)
-    else:
-        logging.info("skipping dataset data copy (dataload is None or 'link')")
+    # create root soft/external links
+    create_links(fin, fout, ctx)
+
+    def object_helper(name, obj):
+        if name in fout:
+            logging.warning('{} already exists and will be skipped'
+                            .format(name))
+        else:
+            class_name = obj.__class__.__name__
+            if class_name in ("Dataset", "Table"):
+                create_dataset(obj, ctx)
+                dset = fout[name] if name in fout else None
+
+                if dset is not None:
+                    for da in obj.attrs:
+                        copy_attribute(dset, da, obj, ctx)
+
+                    if dataload == "ingest":
+                        logging.debug("calling write_dataset for dataset: {}".format(obj.name))
+                        write_dataset(obj, dset, ctx)
+                    else:
+                        logging.info("skipping data copy for {} (dataload is None or 'link')".format(obj.name))
+            elif class_name == "Group":
+                create_group(obj, ctx)
+                grp = fout[name] if name in fout else None
+
+                if grp is not None:
+                    for ga in obj.attrs:
+                        copy_attribute(grp, ga, obj, ctx)
+
+                    # create any soft/external links
+                    create_links(obj, grp, ctx)
+            elif class_name == "Datatype":
+                create_datatype(obj, ctx)
+            else:
+                logging.error("no handler for object class: {}"
+                              .format(type(obj)))
+
+    # build a rough map of the file using the internal function above
+    logging.info("creating target objects and attributes")
+    fin.visititems(object_helper)

     # Fully flush the h5py handle.
     fout.close()

     # close up the source domain, see reason(s) for this below
     fin.close()
-    msg="load_file complete"
+    msg = "load_file complete"
     logging.info(msg)
     if verbose:
         print(msg)
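
End to end, the two changed files cooperate like this: hsload opens the source file with h5py and the target domain with h5pyd (in the mode chosen above), then hands both handles to utillib's load_file, which now makes a single visititems() pass and closes both handles itself. A condensed sketch of that call sequence, assuming the import path follows the repo layout and using hypothetical paths and credentials:

    import h5py
    import h5pyd
    from h5pyd._apps.utillib import load_file

    fin = h5py.File("source.h5", "r")
    fout = h5pyd.File("/home/user/target.h5", "a",
                      endpoint="http://hsdshdflab.hdfgroup.org",
                      username="myuser", password="mypass")

    # copies objects and attributes, skipping any that already exist,
    # then closes both handles (see fout.close()/fin.close() above)
    load_file(fin, fout, verbose=True, dataload="ingest")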
