Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update gscholar.py: add start-year and end-year filters, append the Set-Cookie value from the response to the cookie sent with subsequent requests, and add a delay option between requests. #44

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 21 additions & 3 deletions gscholar/gscholar.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,17 @@
import os
import subprocess
import logging
from time import sleep


GOOGLE_SCHOLAR_URL = "https://scholar.google.com"
HEADERS = {'User-Agent': 'Mozilla/5.0'}
HEADERS = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/12.0',
'Accept-Language': 'en-US',
# 'Accept-Encoding': 'gzip, deflate',
'Accept' : 'text/html',
'Referer' : 'https://scholar.google.com/'
}

FORMAT_BIBTEX = 4
FORMAT_ENDNOTE = 3
Expand All @@ -28,7 +35,7 @@
logger = logging.getLogger(__name__)


def query(searchstr, outformat=FORMAT_BIBTEX, allresults=False):
def query(searchstr, outformat=FORMAT_BIBTEX, delay=0.1, allresults=False, start_year='', end_year=''):
"""Query google scholar.

This method queries google scholar and returns a list of citations.
Expand All @@ -41,6 +48,12 @@ def query(searchstr, outformat=FORMAT_BIBTEX, allresults=False):
the output format of the citations. Default is bibtex.
allresults : bool, optional
return all results or only the first (i.e. best one)
delay : float, optional
delay in seconds before each citation request, to avoid getting banned by Google for too many requests in a short time
start_year : str, optional
four-digit start year of the papers to include, given as a string, e.g. '2020'
end_year : str, optional
four-digit end year of the papers to include, given as a string, e.g. '2022'

Returns
-------
Expand All @@ -49,12 +62,16 @@ def query(searchstr, outformat=FORMAT_BIBTEX, allresults=False):

"""
logger.debug("Query: {sstring}".format(sstring=searchstr))
searchstr = '/scholar?q='+quote(searchstr)
# searchstr = '/scholar?q='+quote(searchstr)
searchstr = f'/scholar?q={quote(searchstr)}&as_ylo={quote(start_year)}&as_yhi={quote(end_year)}'
url = GOOGLE_SCHOLAR_URL + searchstr
header = HEADERS
header['Cookie'] = "GSP=CF=%d" % outformat
request = Request(url, headers=header)
response = urlopen(request)
# append the Set-Cookie value from the response to the Cookie header used for the follow-up requests
set_cookie = response.headers['Set-Cookie']
header['Cookie'] += set_cookie
html = response.read()
html = html.decode('utf8')
# grab the links
Expand All @@ -65,6 +82,7 @@ def query(searchstr, outformat=FORMAT_BIBTEX, allresults=False):
if not allresults:
tmp = tmp[:1]
for link in tmp:
sleep(delay)
url = GOOGLE_SCHOLAR_URL+link
request = Request(url, headers=header)
response = urlopen(request)
Expand Down