You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
In trying to assess the total number of citations for the ESS-DIVE repository, it was discovered that the citation count is off when querying by repository vs querying citations by individual datasets.
Citations from 1/1/2016 to 11/30/2021
Repository-wide query
235 Total Count Returned (totalCitations)
216 Total Citations Returned (citations)
71 Unique Citations (from citations)
Dataset-level query
330 Total Citations
121 Data packages with citations
217 Unique Citations
Python code
The following Python code was used to generate the counts above. You need to install the pandas, requests, and ipywidgets libraries. This code was executed in a Jupyter notebook.
import json

import pandas as pd
import requests
from IPython.display import display
from ipywidgets import widgets, interact

# Show full cell contents (e.g. long titles) instead of truncating them.
pd.set_option('display.max_colwidth', None)

# Setup the inputs: two editable text boxes holding the date range that the
# rest of the notebook reads through `from_date.value` / `to_date.value`.
from_date = widgets.Text("01/01/2016", description="From Date:")
to_date = widgets.Text("09/30/2021", description="To Date:")

for date_widget in (from_date, to_date):
    display(date_widget)
def get_repo_citations(to_date, from_date="01/01/2016"):
    """
    Repository level citations from the metrics service.

    IMPORTANT: These have been found to be incomplete when compared
    to the individual doi queries for citations.

    :param to_date: upper bound of the "month" range filter sent to the service
    :param from_date: lower bound of the "month" range filter sent to the service
    :return: tuple of
        (dict mapping ``"doi:<target id>"`` to the set of citation keys that
        list that target, the raw ``citations`` entry of the response details,
        the ``totalCitations`` value of the response details)
    :raises requests.HTTPError: if the metrics service answers with an error status
    """
    metrics_request_json = {
        "metricsPage": {
            "total": 0,
            "start": 0,
            "count": 0,
        },
        "metrics": [
            "citations",
            "downloads",
            "views",
        ],
        "filterBy": [
            {
                "filterType": "repository",
                "values": [
                    "urn:node:ESS_DIVE",
                ],
                "interpretAs": "list",
            },
            {
                "filterType": "month",
                "values": [
                    from_date,
                    to_date,
                ],
                "interpretAs": "range",
            },
        ],
        "groupBy": [
            "month",
        ],
    }
    metrics_request = json.dumps(metrics_request_json)

    # Let requests URL-encode the JSON document: interpolating it directly
    # into the URL breaks the query if a value contains '&', '#', '+' or '%'.
    metrics_response = requests.get(
        "https://logproc-stage-ucsb-1.test.dataone.org/metrics",
        params={"metricsRequest": metrics_request},
    )
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError below.
    metrics_response.raise_for_status()
    repository_results = metrics_response.json()['resultDetails']

    # Invert the citation listing: for every target id, collect the keys of
    # the citations that reference it.
    repo_citations = {}
    for c, citation in repository_results['citations'].items():
        for t in citation['target_id']:
            repo_citations.setdefault(f"doi:{t}", set()).add(c)

    # NOTE(review): `repository_results` is already the response's
    # 'resultDetails' entry, so this reads a second, nested 'resultDetails'
    # level. Kept as in the original -- confirm against the service schema.
    return repo_citations, repository_results['citations'], repository_results['resultDetails']['totalCitations']
defget_citations(to_date, from_date="01/01/2016"):
""" Get the citations for the specified date range returns tuple (dataframe, dictionary of citations) """# Prepare the data framedf=pd.DataFrame(columns=['citations', 'doi', 'title'])
#IMPORTANT must use archive=* to get all archived and current data packagesresponse=requests.get(f"https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/query/solr?q=formatId:*eml*+AND+NOT+obsoletedBy:*+AND+isPublic:true&fl=id,seriesId,title&wt=json&rows=0&archived=*")
max_rows=response.json()['response']['numFound']
print(f"{max_rows} datasets found.")
# query ESS-DIVE and the metrics service to get the data package citations# TODO: this should be updated to page over the results if it is over 400response=requests.get(f"https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/query/solr?q=formatId:*eml*+AND+NOT+obsoletedBy:*+AND+isPublic:true&fl=id,seriesId,title&wt=json&rows={max_rows}&archived=*")
response_json=response.json()
individual_citations=dict()
# Iterator over datasets and query the metrics service for citationsfordinresponse_json['response']['docs']:
series_id='seriesId'indandd['seriesId'] ord['id']
title=d['title']
metrics_request_json= {"metricsPage":{"total":0,"start":0,"count":0},
"metrics":["citations","downloads","views"],
"filterBy":[{"filterType":"dataset","values":[series_id],"interpretAs":"list"},{"filterType":"month","values":[from_date,to_date],"interpretAs":"range"}],
"groupBy":["month"]}
metrics_request=json.dumps(metrics_request_json)
metrics_response=requests.get(f"https://logproc-stage-ucsb-1.test.dataone.org/metrics?metricsRequest={metrics_request}")
# Get the citations from the metrics responseunique_citations= [c['source_id'] forcinmetrics_response.json()['resultDetails']['citations']]
print(f"{len(unique_citations)} ", end="")
# append to data framedf=df.append({'citations': len(unique_citations),
'doi': series_id,
'title': d['title']}, ignore_index=True)
individual_citations[series_id]=set(un
Counts the citations
from IPython.display import display, HTML

# Dataset level citations: one metrics-service query per data package.
df, individual_citations = get_citations(to_date.value, from_date=from_date.value)

# Keep only the data packages that have at least one citation.
has_citations = df['citations'].gt(0)
df_has_citations = df[has_citations]
print(len(df_has_citations))

# Most-cited data packages first; all rows are kept.
df_has_citations = df_has_citations.sort_values(by='citations', ascending=False)
df_has_citations = df_has_citations.head(len(df_has_citations))
display(HTML(df_has_citations.to_html(index=False)))

# Repository-wide citations: a single query for the whole repository.
unique_repo_citations, repo_query_result, total_citations = get_repo_citations(
    to_date.value, from_date=from_date.value
)

# Union of the per-dataset citation sets gives the distinct citations
# found by the dataset-level queries.
unique = set().union(*individual_citations.values())

print("Repository-wide query")
print(f" {total_citations} Total Count Returned (totalCitations)")
print(f" {len(repo_query_result)} Total Citations Returned (citations)")
print(f" {len(unique_repo_citations)} Unique Citations (from citations)")
print("Dataset-level query")
print(f" {df_has_citations['citations'].sum()} Total Citations")
print(f" {df_has_citations['citations'].count()} Data packages with citations")
print(f" {len(unique)} Unique Citations")
The text was updated successfully, but these errors were encountered:
The ES identifiers index is not up to date with the datasetIdentifierFamily information. I'm working on getting that up to date. Once we have a proper datasetIdentifierFamily index, we'll be able to index the identifiers in the citation metadata table. This applies to both repository-level citation metrics and portal-level citation metrics.
In trying to assess the total number of citations for the ESS-DIVE repository, it was discovered that the citation count is off when querying by repository vs querying citations by individual datasets.
Citations from 1/1/2016 to 11/30/2021
Python code
The following Python code was used to generate the counts above. You need to install the pandas, requests, and ipywidgets libraries. This code was executed in a Jupyter notebook.
Counts the citations
The text was updated successfully, but these errors were encountered: