A complementary, programmatic approach to all of the great answers above, using Python and the Google Scholar Organic Results API from SerpApi.
It's a paid API with a free plan that bypasses blocks from Google and does all the heavy lifting, so the end user only needs to think about what data to extract.
Code and example in the online IDE to extract data from all pages:
import os, json
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
# Query parameters sent to SerpApi's Google Scholar engine.
params = {
    "api_key": os.getenv("API_KEY"),  # your serpapi API key
    "engine": "google_scholar",
    "q": "AI source:NIPS",            # search query
    "hl": "en",                       # language
    # "as_ylo": "2017",               # from 2017
    # "as_yhi": "2021",               # to 2021
    "start": "0",                     # first page
}

search = GoogleSearch(params)

# Accumulates one dict per organic result across every page visited.
organic_results_data = []

papers_is_present = True
while papers_is_present:
    results = search.get_dict()

    # NOTE(review): SerpApi is expected to report failures (invalid key,
    # no results for the query, ...) under a top-level "error" key. Stop
    # paginating instead of crashing on the missing "organic_results" key.
    if "error" in results:
        print(f"Stopping pagination: {results['error']}")
        break

    # Look the pagination block up once per page instead of once per result.
    pagination = results.get("serpapi_pagination", {})
    page_number = pagination.get("current")
    print(f"Currently extracting page №{page_number}..")

    # A page without organic results contributes nothing rather than raising.
    for result in results.get("organic_results", []):
        organic_results_data.append({
            "page_number": page_number,
            # Shifted by one so the first result is reported as position 1.
            "position": result["position"] + 1,
            "result_type": result.get("type"),
            "title": result["title"],
            "link": result.get("link"),
            "result_id": result["result_id"],
            # "publication_info" is treated as optional; a missing summary
            # becomes None instead of raising KeyError.
            "publication_info_summary": result.get("publication_info", {}).get("summary"),
            "snippet": result.get("snippet"),
        })

    if "next" in pagination:
        # Reuse the query string of the "next" URL as the parameters of the
        # following request.
        next_page_query = urlsplit(pagination["next"]).query
        search.params_dict.update(dict(parse_qsl(next_page_query)))
    else:
        papers_is_present = False

print(json.dumps(organic_results_data, indent=2, ensure_ascii=False))
Part of the output:
[
{
"page_number": 1,
"position": 1,
"result_type": "Pdf",
"title": "Nuts and bolts of building AI applications using Deep Learning",
"link": "https://media.nips.cc/Conferences/2016/Slides/6203-Slides.pdf",
"result_id": "-x2la-_xce0J",
"publication_info_summary": "A Ng - NIPS Keynote Talk, 2016 - media.nips.cc",
"snippet": "Given the safety-critical requirement of autonomous driving and thus the need for extremely high levels of accuracy, a pure end-to-end approach is still challenging to get to work. End-…"
}, ... other results
]
Custom solution:
from parsel import Selector
import requests, json, os
def _absolute_scholar_url(href):
    """Resolve a (usually site-relative) Google Scholar href to an absolute URL.

    Returns None when *href* is missing or empty, so callers never end up
    with a bogus link such as ``https://scholar.google.com/scholarNone``.
    """
    from urllib.parse import urljoin

    if not href:
        return None
    # The hrefs already start with "/scholar?...", so they are joined against
    # the site root; prefixing ".../scholar" again would produce
    # "/scholar/scholar?...".
    return urljoin("https://scholar.google.com", href)


def scrape_conference_publications(query: str, source: list[str]):
    """Scrape the first Google Scholar result page for *query*.

    Args:
        query: Search terms; lower-cased before being sent.
        source: Publication sources used to restrict the search. Each item is
            turned into a ``source:<item>`` filter and the filters are joined
            with ``OR``. An empty list means no source restriction.

    Returns:
        A list with one dict per result (``result_id``, ``title``, ``link``,
        ``snippet``, ``publication_info``, ``cite_by_link``,
        ``all_versions_link``, ``related_articles_link`` and a nested ``pdf``
        dict). Fields whose element is not found on the page are None.
        The same list is also printed to stdout as indented JSON.
    """
    # source:NIPS OR source:Neural Information
    sources = " OR ".join(f"source:{item}" for item in source or [])
    search_query = f"{query.lower()} {sources}" if sources else query.lower()

    # https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
    params = {
        "q": search_query,  # search query
        "hl": "en",         # language of the search
        "gl": "us",         # country of the search
    }

    # https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
    }

    html = requests.get("https://scholar.google.com/scholar", params=params, headers=headers, timeout=30)
    selector = Selector(html.text)

    publications = []

    for result in selector.css(".gs_r.gs_scl"):
        publications.append({
            # .get() so a result without a data-cid attribute does not abort the run.
            "result_id": result.attrib.get("data-cid"),
            "title": result.css(".gs_rt").xpath("normalize-space()").get(),
            "link": result.css(".gs_rt a::attr(href)").get(),
            # "::text" only yields the first text node of the snippet;
            # normalize-space() collects the text of the whole element.
            "snippet": result.css(".gs_rs").xpath("normalize-space()").get(),
            "publication_info": result.css(".gs_a").xpath("normalize-space()").get(),
            "cite_by_link": _absolute_scholar_url(
                result.css(".gs_or_btn.gs_nph+ a::attr(href)").get()
            ),
            "all_versions_link": _absolute_scholar_url(
                result.css("a~ a+ .gs_nph::attr(href)").get()
            ),
            "related_articles_link": _absolute_scholar_url(
                result.css("a:nth-child(4)::attr(href)").get()
            ),
            "pdf": {
                "title": result.css(".gs_or_ggsm a").xpath("normalize-space()").get(),
                "link": result.css(".gs_or_ggsm a::attr(href)").get(),
            },
        })

    print(json.dumps(publications, indent=2, ensure_ascii=False))
    # Also hand the data back so callers can use it programmatically.
    return publications
# Example run: search for "anatomy" restricted to the given sources.
scrape_conference_publications(
    query="anatomy",
    source=["NIPS", "Neural Information"],
)
Outputs:
[
{
"result_id": "hjgaRkq_oOEJ",
"title": "Differential representation of arm movement direction in relation to cortical anatomy and function",
"link": "https://iopscience.iop.org/article/10.1088/1741-2560/6/1/016006/meta",
"snippet": "… ",
"publication_info": "T Ball, A Schulze-Bonhage, A Aertsen… - Journal of neural …, 2009 - iopscience.iop.org",
"cite_by_link": "https://scholar.google.com/scholar/scholar?cites=16258204980532099206&as_sdt=2005&sciodt=0,5&hl=en",
"all_versions_link": "https://scholar.google.com/scholar/scholar?cluster=16258204980532099206&hl=en&as_sdt=0,5",
"related_articles_link": "https://scholar.google.com/scholar/scholar?q=related:hjgaRkq_oOEJ:scholar.google.com/&scioq=anatomy+source:NIPS+OR+source:Neural+Information&hl=en&as_sdt=0,5",
"pdf": {
"title": "[PDF] psu.edu",
"link": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.324.1523&rep=rep1&type=pdf"
}
}, ... other results
]
You can also have a look at the dedicated, step-by-step blog post "Scrape Google Scholar Papers within a particular conference in Python" at SerpApi, which covers this topic in detail.
Disclaimer: I work for SerpApi.