import requests
from bs4 import BeautifulSoup
import json
import csv
import re
import time
class GoogleScraper:
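    '''Scrapes Google search results (site:lever.co job postings) and writes them to a CSV file.'''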
    # Crawler entry point
    base_url = 'https://www.google.com/search'
    # Query string parameters used to crawl through follow-up results pages
    # (change params to change the search results)
    pagination_params = {
        'q': 'site:lever.co data analytics',
        'sxsrf': 'ALeKk01Geeak1UCX_8yGflQakZAFNM1OUw:1629662394380',
        'ei': 'uqwiYc69FuzT5NoP69uN-AE',
        'start': '10',
        'sa': 'N',
        'ved': '2ahUKEwjOgvvAtcXyAhXsKVkFHettAx8Q8tMDegQIARA5',
        'biw': '445',
        'bih': '1243'
    }
    # Query string parameters for the initial results page
    initial_params = {
        'sxsrf': 'ALeKk00LkvDQZ6PtGO4v918_z3ABGuPiaA:1629661616523',
        'ei': 'sKkiYfaSH4Cu5NoPzrCHqAQ',
        'start': '',
        'sa': 'N',
        'ved': '2ahUKEwj2qobOssXyAhUAF1kFHU7YAUU4ChDy0wN6BAgWEDg',
        'q': 'site:lever.co+data+analyst',
        'biw': '2283',
        'bih': '1261'
    }
    # Request headers copied from a real browser session
    # (the cookie value is tied to that session and will eventually expire)
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'no-cache',
        'cookie': 'CGIC=IocBdGV4dC9odG1sLGFwcGxpY2F0aW9uL3hodG1sK3htbCxhcHBsaWNhdGlvbi94bWw7cT0wLjksaW1hZ2UvYXZpZixpbWFnZS93ZWJwLGltYWdlL2FwbmcsKi8qO3E9MC44LGFwcGxpY2F0aW9uL3NpZ25lZC1leGNoYW5nZTt2PWIzO3E9MC45; HSID=AWM4Q5se7_9mX8cDk; SSID=Agdq5OxIHfb4V_E7Q; APISID=vb-onbS21cwqS8rp/A-MshZOaqY9RsF-nV; SAPISID=mIoYK5xcEQ2ZsoZF/AI8nqZr-EaN3UTwpa; __Secure-1PAPISID=mIoYK5xcEQ2ZsoZF/AI8nqZr-EaN3UTwpa; __Secure-3PAPISID=mIoYK5xcEQ2ZsoZF/AI8nqZr-EaN3UTwpa; SEARCH_SAMESITE=CgQIq5MB; SID=BAj766LvAczMMUR1HsvoA9JvNi5fF8FAG7FZf7aoe1wYSVdvncQ0_N1fT5aICI_sAXn2tg.; __Secure-1PSID=BAj766LvAczMMUR1HsvoA9JvNi5fF8FAG7FZf7aoe1wYSVdvlhEYK6Q3AAYUxJAUdQia3w.; __Secure-3PSID=BAj766LvAczMMUR1HsvoA9JvNi5fF8FAG7FZf7aoe1wYSVdvfA_kySRM12gzwKxupnDSpw.; OGPC=19022622-1:; NID=221=yACG0FWNDaVyLSSqFjLTEVtRVTXxtrL2x-eXY2EHiZhyZXs075RfwegGetr6_JODsk36fOoZcrIhm9xRXSYBC6QdhZeAXo__te7_OnMe_5tRTIW2FpNIPvTRlCOt43O07iHZUELEXGZ-4bu84f33e5AtBn2lr-QtRLpVysZIoD5qDyJdqQUcLE7t7WjefFg2n9t0kBrCzNUgmmj4ms_VB0zhlT9z2an-dqsnSxz-vjXJXrq2zHca5mKrVZJi; 1P_JAR=2021-08-22-19; DV=o7NFhLF6eMR3MD3d-ztkLcdapCX3ttfemkIIzV1mQQAAAPADV2lI7DiWlwAAAETt-h2qKyYNKgAAAAKrLPZJ7nerm6EMgPOZQ_qTz7YnaCgDEEeazSD-B7uSGsoAAA; UULE=a+cm9sZTogMQpwcm9kdWNlcjogMTIKdGltZXN0YW1wOiAxNjI5NjYyMzcxMzc5MDAwCmxhdGxuZyB7CiAgbGF0aXR1ZGVfZTc6IDQwNzc5NzA1OAogIGxvbmdpdHVkZV9lNzogLTczOTQ3NzI2OQp9CnJhZGl1czogMTM2MTgwNy41MTAwNDQzMDUxCnByb3ZlbmFuY2U6IDYK; SIDCC=AJi4QfG2wGkyogwvEVnlHAq7XzyTVN8Yezmf2iBSZoSgbjLQjM7hddO5pEnLUdRJm1ni4k5uTg; __Secure-3PSIDCC=AJi4QfGg8-k-BWOlHMXCSt6DV73eM0PSXWC9smtrqul28Cw1sE1IyimiROHkpg6o-jHMmXsLtg',
        'pragma': 'no-cache',
        'referer': 'https://www.google.com/',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
    }
    # Scraped results
    results = []
    def fetch(self, query, page):
        '''Makes an HTTP GET request to fetch search results from Google'''
        # Set the search query on the initial params (e.g. "site:lever.co data analyst")
        self.initial_params['q'] = query
        # If fetching the first results page
        if not page:
            # Use initial params
            params = self.initial_params
        # Otherwise we're scraping one of the following pages
        else:
            # Use pagination params
            params = self.pagination_params
            # Google paginates in steps of 10 results, so page N starts at N * 10
            params['start'] = str(page * 10)
            # Set the search query
            params['q'] = query
        # Make HTTP GET request
        response = requests.get(self.base_url, params=params, headers=self.headers)
        print('HTTP GET request to URL: %s | Status code: %s' % (response.url, response.status_code))
        # Return HTTP response
        return response
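    # NOTE: Google may answer with a CAPTCHA/consent page or a non-200 status when it
    # detects automated traffic; in that case parse() will simply find no results.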
    def parse(self, html):
        '''Parses the response's text and extracts data from it'''
        # Parse content
        content = BeautifulSoup(html, 'lxml')
        # Extract data
        # NOTE: these CSS class names match the markup Google served for this query;
        # update them if a new query returns empty results
        title = [title.text for title in content.find_all('h3', {'class': 'LC20lb DKV0Md'})]
        link = [link.next_element['href'] for link in content.find_all('div', {'class': 'yuRUbf'})]
        description = [descr.text for descr in content.find_all('div', {'class': 'VwiC3b MUxGbd yDYNvb lyLwlc lEBKkf'})]
        # Loop over the number of entries
        for index in range(0, len(title)):
            # Append extracted data to results list
            self.results.append({
                'title': title[index],
                'link': link[index],
                'description': description[index]
            })
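    # NOTE: the indexed loop above assumes every result block yields a title, link and
    # description; a missing field can misalign the lists or raise an IndexError.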
    def write_csv(self):
        '''Writes scraped results to a CSV file'''
        # Check that the results list is not empty
        if len(self.results):
            print('Writing results to "lever_res.csv"... ', end='')
            # Open file stream to write CSV
            with open('lever_res.csv', 'w', newline='', encoding='utf-8') as csv_file:
                # Init CSV dictionary writer
                writer = csv.DictWriter(csv_file, fieldnames=self.results[0].keys())
                # Write column names to file
                writer.writeheader()
                # Write results list to CSV file
                for row in self.results:
                    writer.writerow(row)
            print('Done')
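    # NOTE: 'lever_res.csv' is overwritten on every run; switch to append mode or a
    # timestamped filename if results from earlier runs should be kept.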
    def store_response(self, response):
        '''Stores the HTML response to a file for debugging the parser'''
        # If response is OK
        if response.status_code == 200:
            print('Saving response to "res.html"... ', end='')
            # Write response to HTML file
            with open('res.html', 'w', encoding='utf-8') as html_file:
                html_file.write(response.text)
            print('Done')
        else:
            print('Bad response!')
    def load_response(self):
        '''Loads a stored HTML response for debugging the parser'''
        # Open the HTML file and return its contents as a string
        with open('res.html', 'r', encoding='utf-8') as html_file:
            html = html_file.read()
        return html
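    # A possible offline-debugging workflow (sketch): fetch a single page once, save it
    # with store_response(), then iterate on parse() against load_response() without
    # hitting Google again, e.g.:
    #   scraper = GoogleScraper()
    #   scraper.store_response(scraper.fetch('site:lever.co data analytics', 0))
    #   scraper.parse(scraper.load_response())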
    def run(self):
        '''Starts the crawler'''
        # Loop over the range of results pages to scrape
        # (adjust the range to scrape more or fewer pages)
        for page in range(0, 5):
            # Make HTTP GET request
            response = self.fetch('site:lever.co data analytics', page)
            # Parse content
            self.parse(response.text)
            # Wait for 5 seconds between requests to avoid being rate-limited
            time.sleep(5)
        # Write scraped results to CSV file
        self.write_csv()
# Main driver
if __name__ == '__main__':
    scraper = GoogleScraper()
    scraper.run()