I built a web scraper to pull all the historical data from CoinMarketCap.com, a site that records historical data for cryptocurrencies but unfortunately offers no way to download it all in one convenient file.
It works, but it's somewhat slow and could probably be designed better. I'm self-taught, so I'd love some advice on how to improve the code's functionality, speed, and readability. Thanks a lot in advance.
def scrape_coin_marketcap():
    print('Loading Packages...')
    from bs4 import BeautifulSoup
    import requests
    import pandas as pd

    print('Scraping links...')
    # Download HTML of links to historical CoinMarketCap data
    url = 'https://coinmarketcap.com/historical/'
    r = requests.get(url)
    data = r.text
    soup = BeautifulSoup(data, 'html.parser')
    # Scrape a list of links to historical data
    raw_links = []
    for link in soup.find_all('a'):
        raw_links.append(link.get('href'))
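
    # The links to dated snapshots appear to embed the snapshot date in the
    # URL (e.g. /historical/20170101/), so the substring check below is a
    # quick heuristic: any href containing "201" is treated as historical.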
    # Remove non-historical links
    historical_links = []
    for link in raw_links:
        if "201" in str(link):
            historical_links.append('https://coinmarketcap.com' + link)
    print('Scraping Data....')
    # Scrape historical data from each time period
    master_df = pd.DataFrame()
    num_links = len(historical_links)
    print(str(num_links) + " dates to be scraped...")
    for link in historical_links:
        try:
            res = requests.get(link)
            soup = BeautifulSoup(res.content, 'lxml')
            table = soup.find_all('table')[0]
            df = pd.read_html(str(table))[0]
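            # The stringified <h1> tag includes its markup; slicing at fixed
            # offsets strips the opening tag text and the trailing "</h1>",
            # leaving just the snapshot date. Fixed offsets are fragile and
            # will break if the heading markup changes.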
            date = str(soup.find_all('h1')[0])[51:-5]
            df['date'] = date
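            # DataFrame.append returns a new, copied frame rather than
            # mutating in place, hence the reassignment on each iteration.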
            master_df = master_df.append(df)
            print(" Scraping: " + str(date))
        except:
            print(" ERROR Scraping: " + str(link))
    print('Saving to disk...')
    master_df.to_csv('CoinMarketCap.csv', index=False)
    print("Completed.")

if __name__ == "__main__":
    scrape_coin_marketcap()