THE ONLY way to SEARCH for TECH JOBS in 2021 (w/ limited Python Knowledge)

  1. Download the following applications
    1. Python
    2. IDE of choice
      1. My personal preference is VSCode
    3. Google Chrome
  2. Install the following Python packages (a quick install/import check follows right after this list)
    1. Beautiful Soup (the PyPI package is beautifulsoup4, imported as bs4)
    2. Requests
    3. lxml (the parser this script hands to Beautiful Soup)
  3. Create a new Python file in VSCode and paste the code below
    1. Credit to Maksym Korzh for creating this, so I don't have to.
      1. https://github.com/maksimKorzh/one-time-scrapers/blob/master/scrapers/google_scraper.py
  4. Update the following parameters below
    1. pagination_params
    2. initial_params
    3. headers
    4. title
    5. link
    6. description
    7. response (i.e. the search query passed to self.fetch() in run())
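
Before pasting the scraper, you can confirm the packages from step 2 installed cleanly with a quick throwaway script (a minimal sketch; the file name check_deps.py is just a suggestion, run it separately and then delete it):

# check_deps.py - throwaway sanity check for the scraper's dependencies
# install them first with: pip install requests beautifulsoup4 lxml
import requests
import bs4
import lxml

print('requests', requests.__version__)
print('beautifulsoup4', bs4.__version__)
print('lxml', lxml.__version__)

If all three versions print with no ImportError, you're set for step 3. The scraper code itself starts right below.
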
import requests
from bs4 import BeautifulSoup
import json
import csv
import re
import time


class GoogleScraper:
    # Crawler entry point
    base_url = 'https://www.google.com/search'
    
    # Query string parameters to crawl through results pages
    # Change params to change search results
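    # These values come from the query string of a real Google results page; to refresh
    # them, run your search in Chrome, click to page 2, and copy the parameters out of
    # the address bar (or from DevTools > Network > the google.com/search request)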
    pagination_params = {
        'q': 'site:lever.co data analytics',
        'sxsrf': 'ALeKk01Geeak1UCX_8yGflQakZAFNM1OUw:1629662394380',
        'ei': 'uqwiYc69FuzT5NoP69uN-AE',
        'start': '10',
        'sa': 'N',
        'ved': '2ahUKEwjOgvvAtcXyAhXsKVkFHettAx8Q8tMDegQIARA5',
        'biw': '445',
        'bih': '1243'
    }
    
    # Query string parameters for initial results page
    initial_params = {
        'sxsrf': 'ALeKk00LkvDQZ6PtGO4v918_z3ABGuPiaA:1629661616523',
        'ei': 'sKkiYfaSH4Cu5NoPzrCHqAQ',
        'start': '',
        'sa': 'N',
        'ved': '2ahUKEwj2qobOssXyAhUAF1kFHU7YAUU4ChDy0wN6BAgWEDg',
        'q': 'site:lever.co+data+analyst',
        'biw': '2283',
        'bih': '1261'
    }
    
    # Request headers
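    # Replace the cookie and user-agent with your own, copied from Chrome DevTools
    # (Network tab > any google.com/search request > Request Headers); the values
    # below are the original author's browser session and will eventually go stale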
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'no-cache',
        'cookie': 'CGIC=IocBdGV4dC9odG1sLGFwcGxpY2F0aW9uL3hodG1sK3htbCxhcHBsaWNhdGlvbi94bWw7cT0wLjksaW1hZ2UvYXZpZixpbWFnZS93ZWJwLGltYWdlL2FwbmcsKi8qO3E9MC44LGFwcGxpY2F0aW9uL3NpZ25lZC1leGNoYW5nZTt2PWIzO3E9MC45; HSID=AWM4Q5se7_9mX8cDk; SSID=Agdq5OxIHfb4V_E7Q; APISID=vb-onbS21cwqS8rp/A-MshZOaqY9RsF-nV; SAPISID=mIoYK5xcEQ2ZsoZF/AI8nqZr-EaN3UTwpa; __Secure-1PAPISID=mIoYK5xcEQ2ZsoZF/AI8nqZr-EaN3UTwpa; __Secure-3PAPISID=mIoYK5xcEQ2ZsoZF/AI8nqZr-EaN3UTwpa; SEARCH_SAMESITE=CgQIq5MB; SID=BAj766LvAczMMUR1HsvoA9JvNi5fF8FAG7FZf7aoe1wYSVdvncQ0_N1fT5aICI_sAXn2tg.; __Secure-1PSID=BAj766LvAczMMUR1HsvoA9JvNi5fF8FAG7FZf7aoe1wYSVdvlhEYK6Q3AAYUxJAUdQia3w.; __Secure-3PSID=BAj766LvAczMMUR1HsvoA9JvNi5fF8FAG7FZf7aoe1wYSVdvfA_kySRM12gzwKxupnDSpw.; OGPC=19022622-1:; NID=221=yACG0FWNDaVyLSSqFjLTEVtRVTXxtrL2x-eXY2EHiZhyZXs075RfwegGetr6_JODsk36fOoZcrIhm9xRXSYBC6QdhZeAXo__te7_OnMe_5tRTIW2FpNIPvTRlCOt43O07iHZUELEXGZ-4bu84f33e5AtBn2lr-QtRLpVysZIoD5qDyJdqQUcLE7t7WjefFg2n9t0kBrCzNUgmmj4ms_VB0zhlT9z2an-dqsnSxz-vjXJXrq2zHca5mKrVZJi; 1P_JAR=2021-08-22-19; DV=o7NFhLF6eMR3MD3d-ztkLcdapCX3ttfemkIIzV1mQQAAAPADV2lI7DiWlwAAAETt-h2qKyYNKgAAAAKrLPZJ7nerm6EMgPOZQ_qTz7YnaCgDEEeazSD-B7uSGsoAAA; UULE=a+cm9sZTogMQpwcm9kdWNlcjogMTIKdGltZXN0YW1wOiAxNjI5NjYyMzcxMzc5MDAwCmxhdGxuZyB7CiAgbGF0aXR1ZGVfZTc6IDQwNzc5NzA1OAogIGxvbmdpdHVkZV9lNzogLTczOTQ3NzI2OQp9CnJhZGl1czogMTM2MTgwNy41MTAwNDQzMDUxCnByb3ZlbmFuY2U6IDYK; SIDCC=AJi4QfG2wGkyogwvEVnlHAq7XzyTVN8Yezmf2iBSZoSgbjLQjM7hddO5pEnLUdRJm1ni4k5uTg; __Secure-3PSIDCC=AJi4QfGg8-k-BWOlHMXCSt6DV73eM0PSXWC9smtrqul28Cw1sE1IyimiROHkpg6o-jHMmXsLtg',
        'pragma': 'no-cache',
        'referer': 'https://www.google.com/',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
    }
    
    # Scraped results
    results = []
    
    def fetch(self, query, page):
        '''Makes HTTP GET request to fetch search results from google'''
        
        # Init initial_params search query (e.g. "site:lever.co data analyst")
        self.initial_params['q'] = query
        
        # If getting the first results page
        if not page:
            # Use initial params
            params = self.initial_params
        
        # Otherwise we're scraping the following pages
        else:
            # Use pagination params
            params = self.pagination_params
            
            # Specify page number in format page * 10
            params['start'] = str(page * 10)
            
            # Init search query
            params['q'] = query
        
        # Make HTTP GET request
        response = requests.get(self.base_url, params=params, headers=self.headers)
        print('HTTP GET request to URL: %s | Status code: %s' % (response.url, response.status_code))
        
        # Return HTTP response
        return response
        
    def parse(self, html):
        '''Parses response's text and extracts data from it'''
        
        # Parse content
        content = BeautifulSoup(html, 'lxml')
        
        # Extract data
        ## Change class if running new q
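        ## These class names come from Google's rendered HTML and change periodically:
        ## right-click a result in Chrome, choose Inspect, and copy the current class of
        ## the title <h3>, the link wrapper <div>, and the description <div>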
        title = [title.text for title in content.findAll('h3', {'class': 'LC20lb DKV0Md'})]
        link = [link.next_element['href'] for link in content.findAll('div', {'class': 'yuRUbf'})]
        description = [descr.text for descr in content.findAll('div', {'class': 'VwiC3b MUxGbd yDYNvb lyLwlc lEBKkf'})]
        
        # Loop over the number of entries
        for index in range(0, len(title)):
            # Append extracted data to results list
            self.results.append({
                'title': title[index],
                'link': link[index],
                'description': description[index]
            })
    
    def write_csv(self):
        '''Writes scraped results to CSV file'''
        
        # Check results list is not empty
        if len(self.results):
            print('Writing results to "lever_res.csv"... ', end='')
            
            # Open file stream to write CSV
            with open('lever_res.csv', 'w', newline='', encoding='utf-8') as csv_file:
                # Init CSV dictionary writer
                writer = csv.DictWriter(csv_file, fieldnames=self.results[0].keys())
                
                # Write column names to file
                writer.writeheader()
                
                # Write results list to CSV file
                for row in self.results:
                    writer.writerow(row)
            
            print('Done')
       
    def store_response(self, response):
        '''Stores HTML response to file for debugging parser'''
        
        # If response is OK
        if response.status_code == 200:
            print('Saving response to "res.html"... ', end='')
            
            # Write response to HTML file
            with open('res.html', 'w', encoding='utf-8') as html_file:
                html_file.write(response.text)
            
            print('Done')
        else:
            print('Bad response!')
    
    def load_response(self):
        '''Loads HTML response for debugging parser'''
        
        # Read the saved HTML back in as a single string
        with open('res.html', 'r', encoding='utf-8') as html_file:
            html = html_file.read()
        
        # Return HTML as string
        return html
        
    def run(self):
        '''Starts crawler'''

        # Loop over the range of pages to scrape
        ## Put your own search query in the fetch() call below
        for page in range(0, 5):
            # Make HTTP GET request
            response = self.fetch('site:lever.co data analytics', page) 
            
            # Parse content
            self.parse(response.text)
            
            # Wait for 5 sec            
            time.sleep(5)
        
        # Write scraped results to CSV file
        self.write_csv()


# Main driver
if __name__ == '__main__':
    scraper = GoogleScraper()
    scraper.run()
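
The run() method hits Google once per page, so while you are tuning the CSS classes in parse() it helps to work from a saved copy of the results page instead of re-requesting it. Here is a minimal sketch using the class's own store_response() and load_response() helpers (it assumes you saved the code above as google_scraper.py; the query is just an example):

from google_scraper import GoogleScraper

scraper = GoogleScraper()

# Fetch the first results page once and save it to res.html
response = scraper.fetch('site:lever.co data analyst', 0)
scraper.store_response(response)

# Re-parse the saved page as many times as needed while adjusting the selectors
scraper.parse(scraper.load_response())
print(scraper.results[:3])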