diff --git a/create/index_semmeddb_citations.py b/create/index_semmeddb_citations.py index c3312f0..9e19724 100644 --- a/create/index_semmeddb_citations.py +++ b/create/index_semmeddb_citations.py @@ -77,6 +77,7 @@ def index_sentence_data(sentence_data, index_name): "PYEAR" ] df.columns = col_names + df.fillna('NA',inplace=True) logger.info(f"\n{df.head()}") logger.info(df.shape) @@ -99,15 +100,16 @@ def index_sentence_data(sentence_data, index_name): ) bulk_data = [] # print(line.decode('utf-8')) - PMID = row['PMID'].replace("'", "") - if PMID in pmids: + #PMID = row['PMID'].replace("'", "") + if str(row['PMID']) in pmids: data_dict = { - "PMID": PMID, + "PMID": row['PMID'], "ISSN": row['ISSN'], "DP": row['DP'], "EDAT": row['EDAT'], "PYEAR": int(row['PYEAR']), } + op_dict = { "_index": index_name, # "_id": l[0], diff --git a/create/index_semmeddb_sentences.py b/create/index_semmeddb_sentences.py index 31aa563..09d2bae 100644 --- a/create/index_semmeddb_sentences.py +++ b/create/index_semmeddb_sentences.py @@ -105,7 +105,8 @@ def index_sentence_data(sentence_data, index_name): ] df.columns = col_names df.drop(columns=["SECTION_HEADER", "NORMALIZED_SECTION_HEADER"], inplace=True) - df.dropna(inplace=True) + #df.dropna(inplace=True) + df.fillna('NA',inplace=True) logger.info(f"\n{df.head()}") logger.info(df.shape) for i, row in df.iterrows():