Skip to content

Commit 7483969

Browse files
author
Bishal Sarangkoti
authored
Initial Commit
1 parent 61b9db7 commit 7483969

File tree

6 files changed

+243
-0
lines changed

6 files changed

+243
-0
lines changed

chapters.pickle

6 Bytes
Binary file not shown.

epub_writer.py

+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
from ebooklib import epub
2+
import pickle
3+
4+
def write(file_name, title, author, chapters):
    """Assemble an EPUB book from prepared chapters and write it to disk.

    Args:
        file_name: Path of the EPUB file to create.
        title: Book title stored in the EPUB metadata.
        author: Primary author stored in the EPUB metadata.
        chapters: Iterable of chapter items (each exposing ``file_name`` and
            ``title``); they are added to the book in iteration order.
    """
    book = epub.EpubBook()

    # Metadata. The identifier is a fixed placeholder value.
    book.set_identifier('id123456')
    book.set_title(title)
    book.set_language('en')
    book.add_author(author)
    book.add_author('Anonymous', file_as='Anonymous', role='ill', uid='coauthor')

    # Each chapter is registered with the book, the table of contents and the
    # spine; the spine starts with the navigation document.
    toc = []
    spine = ['nav']
    for chapter in chapters:
        book.add_item(chapter)
        toc.append(epub.Link(chapter.file_name, chapter.title, chapter.title))
        spine.append(chapter)

    book.toc = tuple(toc)

    # Default NCX and Nav files
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    # Stylesheet. A ';' was missing between "border-radius:3px" and
    # "margin-top", which made that declaration invalid CSS.
    style = (
        'pre{white-space:pre-wrap;background:#f7f9fa;padding:10px 15px;'
        'color:#263238;line-height:1.6;font-size:13px;border-radius:3px;'
        'margin-top:0;margin-bottom:1em;overflow:auto}'
        'b,strong{font-weight:bolder}'
        '#title{font-size:16px;color:#212121;font-weight:600;margin-bottom:10px}'
        'hr{height:10px;border:0;box-shadow:0 10px 10px -10px #8c8b8b inset}'
    )
    nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
    book.add_item(nav_css)

    book.spine = spine

    # Write the finished book to file_name
    epub.write_epub(file_name, book, {})
42+
43+
44+
def main():
    """Build the EPUB from the chapter list stored in chapters.pickle."""
    # chapters.pickle holds the pickled list of chapter objects
    with open('chapters.pickle', 'rb') as pickle_file:
        saved_chapters = pickle.load(pickle_file)

    write(
        file_name="Leetcode Questions.epub",
        title="Leetcode Questions",
        author="Anonymous",
        chapters=saved_chapters,
    )


if __name__ == "__main__":
    main()

main.py

+144
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
# Author: Bishal Sarang
2+
import json
3+
import pickle
4+
import time
5+
6+
import bs4
7+
import colorama
8+
import requests
9+
from colorama import Back, Fore
10+
from ebooklib import epub
11+
from selenium import webdriver
12+
from selenium.webdriver.chrome.options import Options
13+
from selenium.webdriver.common.by import By
14+
from selenium.webdriver.support import expected_conditions as EC
15+
from selenium.webdriver.support.ui import WebDriverWait
16+
from utils import *
17+
import epub_writer
18+
19+
# Colorama setup; autoreset=True is passed so styles do not need manual resets
colorama.init(autoreset=True)

# Selenium: headless Chrome started from a local chromedriver binary
CHROMEDRIVER_PATH = r"./driver/chromedriver.exe"
options = Options()
# Log level 3 hides warning, error and info logs; only fatal errors are shown
options.add_argument("--log-level=3")
options.headless = True
driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, options=options)

# Index of the last problem already scraped, as stored in track.conf
completed_upto = read_tracker("track.conf")

# Chapters collected by earlier runs, restored from chapters.pickle
with open('chapters.pickle', 'rb') as f:
    chapters = pickle.load(f)
39+
40+
def download(problem_num, url, title, solution_slug):
    """Fetch one problem page and store it as HTML and as an EPUB chapter.

    The page is loaded with the module-level Selenium ``driver``. The problem
    statement is appended to ``out.html`` and, as a chapter, to the
    module-level ``chapters`` list, which is then written to the pickle file.
    Finally ``track.conf`` is updated to ``problem_num``.

    Args:
        problem_num: Index of the problem; used in the chapter file name and
            written to ``track.conf`` on success.
        url: Address of the problem page.
        title: Title shown above the problem statement.
        solution_slug: Unused here; kept so existing callers keep working.

    Any exception is printed to the console, the browser is closed, and the
    exception is not re-raised.
    """
    print(Fore.BLACK + Back.CYAN + "Fetching problem num " + Back.YELLOW + f" {problem_num} " + Back.CYAN + " with url " + Back.YELLOW + f" {url} ")
    n = len(title)

    try:
        driver.get(url)
        # Wait up to 20 secs for the element with id "initial-loading" to
        # become invisible
        WebDriverWait(driver, 20).until(
            EC.invisibility_of_element_located((By.ID, "initial-loading"))
        )
        # Parse the rendered page source of the current tab
        html = driver.page_source
        soup = bs4.BeautifulSoup(html, "html.parser")

        # find() returns None when the div is absent; without this check the
        # literal text "None" would be written into the book.
        content = soup.find("div", {"class": "content__u3I1 question-content__JfgR"})
        if content is None:
            raise ValueError("problem statement not found in page source")

        # Construct HTML: the title framed by rows of '*' as long as the title
        title_decorator = '*' * n
        problem_title_html = title_decorator + f'<div id="title">{title}</div>' + '\n' + title_decorator
        problem_html = problem_title_html + str(content) + '<br><br><hr><br>'

        # Append contents to the combined HTML file
        with open("out.html", "ab") as f:
            f.write(problem_html.encode(encoding="utf-8"))

        # Create the chapter; its language now matches the book language
        # ('en') instead of the previous 'hr'.
        c = epub.EpubHtml(title=title, file_name=f'chap_{problem_num}.xhtml', lang='en')
        c.content = problem_html
        chapters.append(c)

        # Persist the chapter list, then record how far the download got
        dump_chapters_to_file(chapters)
        update_tracker('track.conf', problem_num)
        print(Fore.BLACK + Back.GREEN + "Writing problem num " + Back.YELLOW + f" {problem_num} " + Back.GREEN + " with url " + Back.YELLOW + f" {url} ")
        print(Fore.BLACK + Back.GREEN + " successfull ")

    except Exception as e:
        print(Back.RED + f" Failed Writing!! {e} ")
        # NOTE(review): the shared driver is closed here, so later download()
        # calls in the same run presumably fail as well and track.conf stays
        # at the last successful problem - confirm this is intended.
        driver.quit()
81+
82+
def main():
    """Download all remaining free algorithms problems and build the EPUB.

    The problem list is fetched from the Leetcode API, filtered to free
    problems and sorted by difficulty, then by problem id. Downloading resumes
    after the index stored in the module-level ``completed_upto``. The browser
    is always closed afterwards, and the collected ``chapters`` are then
    written to "Leetcode Questions.epub".

    Raises:
        requests.RequestException: If the problem list cannot be fetched.
    """
    # Leetcode API URL to get json of problems on algorithms categories
    ALGORITHMS_ENDPOINT_URL = "https://leetcode.com/api/problems/algorithms/"

    # Problem URL is of format ALGORITHMS_BASE_URL + question__title_slug
    # If question__title_slug = "two-sum" then URL is https://leetcode.com/problems/two-sum
    ALGORITHMS_BASE_URL = "https://leetcode.com/problems/"

    # Pauses between page loads, in seconds. The printed message is built from
    # these values; it previously announced 20 secs while sleeping 5.
    short_pause = 5
    long_pause = 120
    long_pause_every = 30

    try:
        # Load JSON from API; fail early on a hung connection or HTTP error
        # instead of handing an error page to json.loads.
        response = requests.get(ALGORITHMS_ENDPOINT_URL, timeout=30)
        response.raise_for_status()
        algorithms_problems_json = json.loads(response.content)

        # A ';' was missing between "border-radius:3px" and "margin-top",
        # which made that declaration invalid CSS.
        styles_str = (
            "<style>"
            "pre{white-space:pre-wrap;background:#f7f9fa;padding:10px 15px;"
            "color:#263238;line-height:1.6;font-size:13px;border-radius:3px;"
            "margin-top:0;margin-bottom:1em;overflow:auto}"
            "b,strong{font-weight:bolder}"
            "#title{font-size:16px;color:#212121;font-weight:600;margin-bottom:10px}"
            "hr{height:10px;border:0;box-shadow:0 10px 10px -10px #8c8b8b inset}"
            "</style>"
        )
        with open("out.html", "ab") as f:
            f.write(styles_str.encode(encoding="utf-8"))

        # Tuples of (title slug, difficulty, frontend id, title, article slug)
        links = []
        for child in algorithms_problems_json["stat_status_pairs"]:
            # Only process free problems
            if not child["paid_only"]:
                question__title_slug = child["stat"]["question__title_slug"]
                question__article__slug = child["stat"]["question__article__slug"]
                question__title = child["stat"]["question__title"]
                frontend_question_id = child["stat"]["frontend_question_id"]
                difficulty = child["difficulty"]["level"]
                links.append((question__title_slug, difficulty, frontend_question_id, question__title, question__article__slug))

        # Sort by difficulty followed by problem id in ascending order
        links = sorted(links, key=lambda x: (x[1], x[2]))

        first_pending = completed_upto + 1
        for i, link in enumerate(links[first_pending:], start=first_pending):
            question__title_slug, _, frontend_question_id, question__title, question__article__slug = link
            url = ALGORITHMS_BASE_URL + question__title_slug
            title = f"{frontend_question_id}. {question__title}"

            # Download each file as html and write chapter to chapters.pickle
            download(i, url, title, question__article__slug)

            # Long pause at every 30th index, short pause otherwise. Index 0
            # is excluded so the first problem does not trigger a long pause.
            if i > 0 and i % long_pause_every == 0:
                pause = long_pause
            else:
                pause = short_pause
            print(f"Sleeping {pause} secs\n")
            time.sleep(pause)

    finally:
        # Close the browser after download, also when fetching the list fails
        driver.quit()

    try:
        epub_writer.write("Leetcode Questions.epub", "Leetcode Questions", "Anonymous", chapters)
        print(Back.GREEN + "All operations successful")
    except Exception as e:
        print(Back.RED + f"Error making epub {e}")


if __name__ == "__main__":
    main()

out.html

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+

track.conf

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
-1

utils.py

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
"""
2+
Contains utility functions for recording how far the problems have been downloaded, writing chapter info to a file, resetting the configuration,
3+
and reading back how far the problems have been downloaded
4+
"""
5+
import pickle
6+
7+
def update_tracker(file_name, problem_num):
    """Overwrite ``file_name`` with the text form of ``problem_num``."""
    with open(file_name, "w") as tracker_file:
        tracker_file.write(f"{problem_num}")
13+
14+
def dump_chapters_to_file(chapters):
    """Pickle ``chapters`` into chapters.pickle, replacing its old contents."""
    serialized = pickle.dumps(chapters)
    with open('chapters.pickle', 'wb') as pickle_file:
        pickle_file.write(serialized)
20+
21+
def reset_configuration():
    """
    Put the scraper back into its initial state:
    - track.conf is set to -1
    - chapters.pickle is replaced by an empty list
    - out.html is overwritten with a single space
    """
    update_tracker("track.conf", -1)
    dump_chapters_to_file([])

    blank = b" "
    with open("out.html", "wb") as html_file:
        html_file.write(blank)
32+
33+
34+
def read_tracker(file_name):
    """Return the integer stored on the first line of ``file_name``."""
    with open(file_name, "r") as tracker_file:
        first_line = tracker_file.readline()
        return int(first_line)
40+
41+
42+
43+

0 commit comments

Comments
 (0)