Pold87 · bmascat · Oct 26, 2023 · Oct 26, 2023 · Feb 11, 2024 · Feb 11, 2024
diff --git a/README.md b/README.md
@@ -1,21 +1,23 @@
-# Historic word occurrence in academic papers
+# Occurrence of a list of keywords in Google Scholar. Extraction of search results.
 
 ## Summary
 
-This script extracts the historic word occurrence of a search term in
-academic papers (from Google Scholar). It allows for spotting trends
-in research and analyzing the relevance of a topic over time.
+This script extracts the number of Google Scholar results for each term in a list of search terms. It helps to prioritise research niches and to identify topics that may be under-researched.
+
+It can be useful when scoping a scientific review, to see where most of the information is to be found.
 
 There is a Python 3 branch (master) and a Python 2 branch (python2).
 
 ## Usage
 
-`python extract_occurrences.py '<keyword>' <start date> <end date>`
+Add the keywords you want to search for to `input.csv`, one per line, and run the script. To search for a combination of words, put a `+` between them.
+
+`python extract_occurrences.py`
 
-This command lists the number of publications for every year using
-this keyword. The script just searches for articles and excludes
+The script just searches for articles and excludes
 patents and citations.
 
+**visualization.ipynb**: This notebook helps to visualise the scraping results by generating a bar chart.
 
 ### Alternative: Usage with Docker
 
@@ -26,25 +28,24 @@ You can use [Docker](https://www.docker.com/) to run this script, without the ne
 
 ## Example
 
-- Search term: 'bitcoin'
-- Desired time span: 2000 to 2015
-- Command: `python extract_occurrences.py 'bitcoin' 2000 2015`
+- Search terms: 'sarcopenia + {drugs for cancer treatment}'
+- Command: `python extract_occurrences.py`
 - Output: `out.csv`, with the following contents:
 
-| year | results |
+| search_term | results |
 |------|---------
 | ...  |    ...  |	|
-| 2011 |    141  |
-| 2012 |    292  |
-| 2013 |    889  |
-| 2014 |    2370 |
-| 2015 |    2580 |
-
+| sarcopenia+PEMBROlizumab |    1340  |
+| sarcopenia+OSIMERTINIB   |    179   |
+| sarcopenia+NIVOlumab     |    1490  |
+| sarcopenia+ABEMACICLIB   |    77    |
+| sarcopenia+PERTuzumab    |    208   |
 
-![bitcoin chart](https://raw.githubusercontent.com/Pold87/academic-keyword-occurrence/master/bitcoin_chart.png "bitcoin chart")
+![sarcopenia and drugs chart](https://github.com/BreisOne/academic-keyword-occurrence/blob/master/bar_plot_results.jpg "sarcopenia and drugs chart")
 
 ## Credits
 Created by Volker Strobel - [email protected]
+Adapted by Brais Bea - [email protected]
 
 If you use this code in academic papers, please cite this repository via Zenodo (http://doi.org/10.5281/zenodo.1218409):
 

diff --git a/bar_plot_results.jpg b/bar_plot_results.jpg
diff --git a/bitcoin_chart.png b/bitcoin_chart.png
diff --git a/extract_occurrences.py b/extract_occurrences.py
@@ -3,35 +3,43 @@
 from urllib.request import Request, build_opener, HTTPCookieProcessor
 from urllib.parse import urlencode
 from http.cookiejar import MozillaCookieJar
-import re, time, sys, urllib
+import re
+import time
+import sys
+import urllib
+import csv
 
-def get_num_results(search_term, start_date, end_date):
+
+def get_num_results(search_term):
     """
     Helper method, sends HTTP request and returns response payload.
     """
 
     # Open website and read html
     user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36'
-    query_params = { 'q' : search_term, 'as_ylo' : start_date, 'as_yhi' : end_date}
-    url = "https://scholar.google.com/scholar?as_vis=1&hl=en&as_sdt=1,5&" + urllib.parse.urlencode(query_params)
+    query_params = {'q': search_term}
+    url = "https://scholar.google.com/scholar?as_vis=1&hl=en&as_sdt=1,5&" + \
+        urllib.parse.urlencode(query_params)
     opener = build_opener()
     request = Request(url=url, headers={'User-Agent': user_agent})
     handler = opener.open(request)
-    html = handler.read() 
+    html = handler.read()
 
     # Create soup for parsing HTML and extracting the relevant information
     soup = BeautifulSoup(html, 'html.parser')
-    div_results = soup.find("div", {"id": "gs_ab_md"}) # find line 'About x results (y sec)
+    # find line 'About x results (y sec)
+    div_results = soup.find("div", {"id": "gs_ab_md"})
 
     if div_results != None:
 
-        res = re.findall(r'(\d+).?(\d+)?.?(\d+)?\s', div_results.text) # extract number of search results
-
+        # extract number of search results
+        res = re.findall(r'(\d+).?(\d+)?.?(\d+)?\s', div_results.text)
+
         if res == []:
             num_results = '0'
             success = True
         else:
-            num_results = ''.join(res[0]) # convert string to numbe
+            num_results = ''.join(res[0])  # convert string to number
             success = True
 
     else:
@@ -40,36 +48,33 @@ def get_num_results(search_term, start_date, end_date):
 
     return num_results, success
 
-def get_range(search_term, start_date, end_date):
 
-    fp = open("out.csv", 'w')
-    fp.write("year,results\n")
-    print("year,results")
+def get_range(reader):
 
-    for date in range(start_date, end_date + 1):
+    fp = open("out.csv", 'w')
+    fp.write("search_term,results\n")
+    print("search_term, results")
 
-        num_results, success = get_num_results(search_term, date, date)
+    for row in reader:
+        search_term = row[0]
+        num_results, success = get_num_results(search_term)
         if not(success):
             print("It seems that you made to many requests to Google Scholar. Please wait a couple of hours and try again.")
             break
-        year_results = "{0},{1}".format(date, num_results)
-        print(year_results)
-        fp.write(year_results + '\n')
-        time.sleep(0.8)
+        search_term_results = "{0},{1}".format(search_term, num_results)
+        print(search_term_results)
+        fp.write(search_term_results + '\n')
+        time.sleep(1.1)
 
     fp.close()
-
+    
 if __name__ == "__main__":
+    print("******")
+    print("Academic word relevance")
+    print("******")
+    print("")
+    print("Usage: python extract_occurences.py")
 
-    if len(sys.argv) < 3:
-        print("******")
-        print("Academic word relevance")
-        print("******")
-        print("")
-        print("Usage: python extract_occurences.py '<search term>' <start date> <end date>")
-
-    else:
-        search_term = sys.argv[1]
-        start_date = int(sys.argv[2])
-        end_date = int(sys.argv[3])
-        html = get_range(search_term, start_date, end_date)
+    with open("input.csv", mode='r') as file:
+        reader = csv.reader(file)
+        get_range(reader)
diff --git a/input.csv b/input.csv
@@ -0,0 +1,65 @@
+sarcopenia+PEMBROlizumab
+sarcopenia+OSIMERTINIB
+sarcopenia+NIVOlumab
+sarcopenia+ABEMACICLIB
+sarcopenia+EMTANSINA TRASTUZUMAB
+sarcopenia+PERTuzumab
+sarcopenia+PANItumumab
+sarcopenia+ALECTINIB
+sarcopenia+DABRAFENIB
+sarcopenia+NIRAPARIB
+sarcopenia+TRASTuzumab
+sarcopenia+CABOZANTINIB
+sarcopenia+DERUXTECAN TRASTUZUMAB
+sarcopenia+RAMUCIRUMAB
+sarcopenia+ATEZOlizumab
+sarcopenia+BEVACIZUMAB
+sarcopenia+OLAPARIB
+sarcopenia+TRAMETINIB
+sarcopenia+RIBOCICLIB
+sarcopenia+DURVALUMAB
+sarcopenia+LENVATINIB
+sarcopenia+PALBOCICLIB
+sarcopenia+TRIFLURIDINA TIPIRACILO
+sarcopenia+IPILIMUMAB
+sarcopenia+PAZOPANIB
+sarcopenia+CRIZOTINIB
+sarcopenia+TRABECTEDINA
+sarcopenia+AVELumab
+sarcopenia+LORLATINIB
+sarcopenia+ENZALUTAMIDA
+sarcopenia+AFATINIB
+sarcopenia+CAPECITABINA
+sarcopenia+CETUXIMAB
+sarcopenia+PACLitaxel ALBUMINA
+sarcopenia+AFLIBERCEPT
+sarcopenia+SACITUZUMAB GOVITECAN
+sarcopenia+eriBULina
+sarcopenia+PACLITAXEL
+sarcopenia+CLORURO DE SODIO
+sarcopenia+ENCORAFENIB
+sarcopenia+EVEROLIMUS
+sarcopenia+VINORELBINA
+sarcopenia+PEMETREXED
+sarcopenia+BINIMETINIB
+sarcopenia+CARBOPLATINO
+sarcopenia+ABIRATERONA
+sarcopenia+DENOSUMAB
+sarcopenia+EVEROLIMUS
+sarcopenia+FILGRASTIM
+sarcopenia+FLUOROURACILO
+sarcopenia+DOXORUBICINA
+sarcopenia+GEMCITABINA
+sarcopenia+REGORAFENIB
+sarcopenia+SELPERCATINIB
+sarcopenia+REMDESIVIR
+sarcopenia+ENOXAPARINA
+sarcopenia+IRINOTECAN
+sarcopenia+NINTEDANIB
+sarcopenia+PLERIXAFOR
+sarcopenia+ALPELISIB
+sarcopenia+RUCAPARIB
+sarcopenia+ALBÚMINA
+sarcopenia+TEMOZOLOMIDA
+sarcopenia+OXALIPLATINO
+sarcopenia+SOTORASIB
diff --git a/out.csv b/out.csv
@@ -1,12 +1,65 @@
-year,results
-2005,73
-2006,113
-2007,141
-2008,171
-2009,314
-2010,296
-2011,413
-2012,623
-2013,1350
-2014,3100
-2015,3960
+search_term,results
+sarcopenia+PEMBROlizumab,1340
+sarcopenia+OSIMERTINIB,179
+sarcopenia+NIVOlumab,1490
+sarcopenia+ABEMACICLIB,77
+sarcopenia+EMTANSINA TRASTUZUMAB,89
+sarcopenia+PERTuzumab,208
+sarcopenia+PANItumumab,237
+sarcopenia+ALECTINIB,67
+sarcopenia+DABRAFENIB,164
+sarcopenia+NIRAPARIB,67
+sarcopenia+TRASTuzumab,1270
+sarcopenia+CABOZANTINIB,287
+sarcopenia+DERUXTECAN TRASTUZUMAB,54
+sarcopenia+RAMUCIRUMAB,255
+sarcopenia+ATEZOlizumab,673
+sarcopenia+BEVACIZUMAB,1670
+sarcopenia+OLAPARIB,235
+sarcopenia+TRAMETINIB,189
+sarcopenia+RIBOCICLIB,83
+sarcopenia+DURVALUMAB,345
+sarcopenia+LENVATINIB,447
+sarcopenia+PALBOCICLIB,176
+sarcopenia+TRIFLURIDINA TIPIRACILO,0
+sarcopenia+IPILIMUMAB,857
+sarcopenia+PAZOPANIB,328
+sarcopenia+CRIZOTINIB,210
+sarcopenia+TRABECTEDINA,107
+sarcopenia+AVELumab,267
+sarcopenia+LORLATINIB,28
+sarcopenia+ENZALUTAMIDA,466
+sarcopenia+AFATINIB,244
+sarcopenia+CAPECITABINA,2330
+sarcopenia+CETUXIMAB,807
+sarcopenia+PACLitaxel ALBUMINA,1100
+sarcopenia+AFLIBERCEPT,109
+sarcopenia+SACITUZUMAB GOVITECAN,34
+sarcopenia+eriBULina,98
+sarcopenia+PACLITAXEL,3050
+sarcopenia+CLORURO DE SODIO,442
+sarcopenia+ENCORAFENIB,45
+sarcopenia+EVEROLIMUS,1080
+sarcopenia+VINORELBINA,370
+sarcopenia+PEMETREXED,533
+sarcopenia+BINIMETINIB,50
+sarcopenia+CARBOPLATINO,1710
+sarcopenia+ABIRATERONA,508
+sarcopenia+DENOSUMAB,3030
+sarcopenia+EVEROLIMUS,1080
+sarcopenia+FILGRASTIM,111
+sarcopenia+FLUOROURACILO,3360
+sarcopenia+DOXORUBICINA,4600
+sarcopenia+GEMCITABINA,2180
+sarcopenia+REGORAFENIB,383
+sarcopenia+SELPERCATINIB,15
+sarcopenia+REMDESIVIR,385
+sarcopenia+ENOXAPARINA,107
+sarcopenia+IRINOTECAN,1250
+sarcopenia+NINTEDANIB,224
+sarcopenia+PLERIXAFOR,39
+sarcopenia+ALPELISIB,50
+sarcopenia+RUCAPARIB,58
+sarcopenia+TEMOZOLOMIDA,482
+sarcopenia+OXALIPLATINO,2060
+sarcopenia+SOTORASIB,25
diff --git a/visualization.ipynb b/visualization.ipynb