|
| 1 | +# Author: Bishal Sarang |
| 2 | +import json |
| 3 | +import pickle |
| 4 | +import time |
| 5 | + |
| 6 | +import bs4 |
| 7 | +import colorama |
| 8 | +import requests |
| 9 | +from colorama import Back, Fore |
| 10 | +from ebooklib import epub |
| 11 | +from selenium import webdriver |
| 12 | +from selenium.webdriver.chrome.options import Options |
| 13 | +from selenium.webdriver.common.by import By |
| 14 | +from selenium.webdriver.support import expected_conditions as EC |
| 15 | +from selenium.webdriver.support.ui import WebDriverWait |
| 16 | +from utils import * |
| 17 | +import epub_writer |
| 18 | + |
# Initialize Colorama so colored console output auto-resets after each print
colorama.init(autoreset=True)

# Setup Selenium Webdriver (headless Chrome)
# NOTE(review): the executable_path keyword was deprecated and later removed
# in Selenium 4 — confirm the pinned selenium version still supports it.
CHROMEDRIVER_PATH = r"./driver/chromedriver.exe"
options = Options()
options.headless = True
# Disable Warning, Error and Info logs
# Show only fatal errors
options.add_argument("--log-level=3")
driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, options=options)


# Get upto which problem it is already scraped from track.conf file,
# so an interrupted run can resume instead of starting over
completed_upto = read_tracker("track.conf")

# Load the list of previously built epub chapters from disk.
# (pickle.load on a file this script wrote itself — not untrusted input.)
with open('chapters.pickle', 'rb') as f:
    chapters = pickle.load(f)
| 39 | + |
def download(problem_num, url, title, solution_slug):
    """Fetch one problem page and append it to out.html and the chapter list.

    Args:
        problem_num: Index of the problem in the sorted links list; used for
            the chapter file name and for progress tracking.
        url: Full URL of the problem page to scrape.
        title: Display title, e.g. "1. Two Sum".
        solution_slug: Article slug for the problem (currently unused; kept
            so the caller's interface is unchanged).

    On failure the error is logged and the function returns normally so the
    caller can continue with the next problem.
    """
    print(Fore.BLACK + Back.CYAN + "Fetching problem num " + Back.YELLOW + f" {problem_num} " + Back.CYAN + " with url " + Back.YELLOW + f" {url} ")

    try:
        driver.get(url)
        # Wait up to 20 secs until the div with id "initial-loading"
        # disappears, i.e. the client-side-rendered content is ready.
        WebDriverWait(driver, 20).until(
            EC.invisibility_of_element_located((By.ID, "initial-loading"))
        )
        # Parse the fully rendered page from the current tab
        soup = bs4.BeautifulSoup(driver.page_source, "html.parser")

        # Construct the chapter HTML: decorated title + problem statement
        title_decorator = '*' * len(title)
        problem_title_html = title_decorator + f'<div id="title">{title}</div>' + '\n' + title_decorator
        problem_html = problem_title_html + str(soup.find("div", {"class": "content__u3I1 question-content__JfgR"})) + '<br><br><hr><br>'

        # Append contents to the cumulative HTML file
        with open("out.html", "ab") as f:
            f.write(problem_html.encode(encoding="utf-8"))

        # Create and append a chapter used later to construct the epub
        c = epub.EpubHtml(title=title, file_name=f'chap_{problem_num}.xhtml', lang='hr')
        c.content = problem_html
        chapters.append(c)

        # Persist progress so a rerun can resume where it left off
        dump_chapters_to_file(chapters)
        update_tracker('track.conf', problem_num)
        print(Fore.BLACK + Back.GREEN + "Writing problem num " + Back.YELLOW + f" {problem_num} " + Back.GREEN + " with url " + Back.YELLOW + f" {url} ")
        print(Fore.BLACK + Back.GREEN + " successfull ")

    except Exception as e:
        # BUG FIX: the original also called driver.quit() here, which killed
        # the shared browser session and made every subsequent download in
        # main()'s loop fail. Log and return; main()'s finally owns quit().
        print(Back.RED + f" Failed Writing!! {e} ")
| 81 | + |
def main():
    """Scrape every free LeetCode algorithms problem into out.html and an epub."""
    # Leetcode API URL returning JSON metadata for all "algorithms" problems
    ALGORITHMS_ENDPOINT_URL = "https://leetcode.com/api/problems/algorithms/"

    # Problem URL is ALGORITHMS_BASE_URL + question__title_slug
    # e.g. slug "two-sum" -> https://leetcode.com/problems/two-sum
    ALGORITHMS_BASE_URL = "https://leetcode.com/problems/"

    # Load JSON from the API (timeout so a hung request can't stall forever)
    algorithms_problems_json = json.loads(
        requests.get(ALGORITHMS_ENDPOINT_URL, timeout=30).content
    )

    # BUG FIX: the original CSS was missing a ";" between "border-radius:3px"
    # and "margin-top:0", which invalidated both declarations in the output.
    styles_str = "<style>pre{white-space:pre-wrap;background:#f7f9fa;padding:10px 15px;color:#263238;line-height:1.6;font-size:13px;border-radius:3px;margin-top:0;margin-bottom:1em;overflow:auto}b,strong{font-weight:bolder}#title{font-size:16px;color:#212121;font-weight:600;margin-bottom:10px}hr{height:10px;border:0;box-shadow:0 10px 10px -10px #8c8b8b inset}</style>"
    with open("out.html", "ab") as f:
        f.write(styles_str.encode(encoding="utf-8"))

    # Collect (slug, difficulty, id, title, article_slug) for free problems only
    links = []
    for child in algorithms_problems_json["stat_status_pairs"]:
        if not child["paid_only"]:
            stat = child["stat"]
            links.append((
                stat["question__title_slug"],
                child["difficulty"]["level"],
                stat["frontend_question_id"],
                stat["question__title"],
                stat["question__article__slug"],
            ))

    # Sort by difficulty followed by problem id in ascending order
    links.sort(key=lambda x: (x[1], x[2]))

    try:
        # Resume right after the last successfully scraped problem
        for i in range(completed_upto + 1, len(links)):
            question__title_slug, _, frontend_question_id, question__title, question__article__slug = links[i]
            url = ALGORITHMS_BASE_URL + question__title_slug
            title = f"{frontend_question_id}. {question__title}"

            # Download each problem as html and write chapter to chapters.pickle
            download(i, url, title, question__article__slug)

            # Throttle: 2 mins after every 30 problems, 5 secs otherwise.
            # BUG FIX: the original printed "Sleeping 20 secs" but slept 5;
            # the message now matches the actual delay.
            if i % 30 == 0:
                print("Sleeping 120 secs\n")
                time.sleep(120)
            else:
                print("Sleeping 5 secs\n")
                time.sleep(5)
    finally:
        # Close the browser whether the loop finished or raised
        driver.quit()

    try:
        epub_writer.write("Leetcode Questions.epub", "Leetcode Questions", "Anonymous", chapters)
        print(Back.GREEN + "All operations successful")
    except Exception as e:
        print(Back.RED + f"Error making epub {e}")
| 140 | + |
| 141 | + |
| 142 | + |
| 143 | +if __name__ == "__main__": |
| 144 | + main() |
0 commit comments