Skip to content

Commit

Permalink
dealing with missing data
Browse files: browse the repository at this point in the history
  • Loading branch information
Ben Elsworth committed May 21, 2021
1 parent 5706e49 commit e89961b
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 4 deletions.
8 changes: 5 additions & 3 deletions create/index_semmeddb_citations.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -77,6 +77,7 @@ def index_sentence_data(sentence_data, index_name):
"PYEAR"
]
df.columns = col_names
df.fillna('NA',inplace=True)
logger.info(f"\n{df.head()}")
logger.info(df.shape)

Expand All @@ -99,15 +100,16 @@ def index_sentence_data(sentence_data, index_name):
)
bulk_data = []
# print(line.decode('utf-8'))
PMID = row['PMID'].replace("'", "")
if PMID in pmids:
#PMID = row['PMID'].replace("'", "")
if str(row['PMID']) in pmids:
data_dict = {
"PMID": PMID,
"PMID": row['PMID'],
"ISSN": row['ISSN'],
"DP": row['DP'],
"EDAT": row['EDAT'],
"PYEAR": int(row['PYEAR']),
}

op_dict = {
"_index": index_name,
# "_id": l[0],
Expand Down
3 changes: 2 additions & 1 deletion create/index_semmeddb_sentences.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -105,7 +105,8 @@ def index_sentence_data(sentence_data, index_name):
]
df.columns = col_names
df.drop(columns=["SECTION_HEADER", "NORMALIZED_SECTION_HEADER"], inplace=True)
df.dropna(inplace=True)
#df.dropna(inplace=True)
df.fillna('NA',inplace=True)
logger.info(f"\n{df.head()}")
logger.info(df.shape)
for i, row in df.iterrows():
Expand Down

0 comments on commit e89961b

Please sign in to comment.