
Commit 96ea50a

notebook

1 parent ee37a13

276 files changed: +102064, -0 lines changed


Diff for: notebook/fetch_cookbook.py

+175
@@ -0,0 +1,175 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: Seaky
# @Date: 2019/6/19 14:40

# pip install requests beautifulsoup4

import json
import re
from copy import deepcopy
from pathlib import Path

import requests
from bs4 import BeautifulSoup

# Skeleton of an nbformat-4 notebook; "cells" is filled in per chapter/section before writing.
TEMPLATE = {
    "cells": [],
    "metadata": {
        "kernelspec": {
            "display_name": "Python 3",
            "language": "python",
            "name": "python3"
        },
        "language_info": {
            "codemirror_mode": {
                "name": "ipython",
                "version": 3
            },
            "file_extension": ".py",
            "mimetype": "text/x-python",
            "name": "python",
            "nbconvert_exporter": "python",
            "pygments_lexer": "ipython3",
            "version": "3.7.1"
        },
        "toc": {
            "base_numbering": 1,
            "nav_menu": {},
            "number_sections": True,
            "sideBar": True,
            "skip_h1_title": True,
            "title_cell": "Table of Contents",
            "title_sidebar": "Contents",
            "toc_cell": False,
            "toc_position": {},
            "toc_section_display": True,
            "toc_window_display": True
        }
    },
    "nbformat": 4,
    "nbformat_minor": 2
}


class Chapter:
    """Fetch one chapter of the online python3-cookbook and convert it to notebooks."""

    def __init__(self, chapter_address):
        self.chapter_address = chapter_address
        # Base directory of the chapter page, used to resolve relative section links.
        self.path = re.sub(r'/[^/]+?$', '/', chapter_address)
        self.ss = requests.session()

    def fetch(self, url):
        """Download a page and decode it using the charset declared near the top of the HTML."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
        print(url)
        raw = self.ss.get(url, headers=headers).content
        m = re.search(r'charset=\W*(?P<charset>\w+)', raw[:200].decode(errors='ignore'))
        charset = m.group('charset') if m else 'utf-8'
        if charset == 'gb2312':
            charset = 'cp936'
        return raw.decode(encoding=charset)

    def fetch_list(self):
        """Collect the chapter title, its description and the list of section links."""
        content = self.fetch(self.chapter_address)
        soup = BeautifulSoup(content, 'html.parser')
        self.chapter_title = soup.find('h1').text.replace('¶', '')
        self.chapter_desc = soup.find('p').text
        self.sections = []
        for x in soup.find_all('a', class_='reference internal', href=re.compile(r'/p\d+_')):
            if x['href'] not in self.sections:
                self.sections.append(x['href'])

    def fetch_sections(self, sep=False):
        """Convert every section to cells and write the notebook(s) under ipynb/.

        With sep=True, each section is also written as its own notebook in a
        per-chapter subdirectory, in addition to the combined chapter notebook.
        """
        cells = [{
            "cell_type": "markdown",
            "metadata": {},
            "source": ['# {}\n {}'.format(self.chapter_title, self.chapter_desc)]
        }]
        dpath = Path('ipynb')
        dpath.mkdir(exist_ok=True)
        for href in self.sections[:]:
            _cells = self.fetch_content(self.path + href)
            if sep:
                _dpath = dpath / self.chapter_title
                _dpath.mkdir(exist_ok=True)
                TEMPLATE['cells'] = _cells
                *_, section_name = href.split('/')
                (_dpath / '{}.ipynb'.format(section_name.split('.')[0])).write_text(
                    json.dumps(TEMPLATE, indent=2), encoding='utf-8')
            cells.extend(_cells)
        TEMPLATE['cells'] = cells
        (dpath / '{}.ipynb'.format(self.chapter_title)).write_text(
            json.dumps(TEMPLATE, indent=2), encoding='utf-8')

    def fetch_content(self, url):
        """Parse one section page into a list of notebook cells."""
        content = self.fetch(url)
        soup = BeautifulSoup(content, 'html.parser')

        cell_markdown = {
            "cell_type": "markdown",
            "metadata": {},
            "source": []
        }
        cell_code = {
            "cell_type": "code",
            "execution_count": None,
            "metadata": {},
            "outputs": [],
            "source": []
        }
        cells = []
        p_header = re.compile(r'^h(?P<level>\d)$')
        for tag in [x for x in soup.descendants if x.name]:
            if p_header.search(tag.name):
                # <h1>..<h6> become markdown headings, demoted by one level.
                cell = deepcopy(cell_markdown)
                cell['source'].append(
                    '{} {}\n'.format('#' * (int(p_header.search(tag.name).group('level')) + 1), tag.text))
                cells.append(cell)
            elif tag.name == 'p':
                if 'Copyright' in tag.text:
                    continue
                cell = deepcopy(cell_markdown)
                cell['source'].append(tag.text)
                cells.append(cell)
            elif tag.name == 'pre':
                if '>>>' not in tag.text:
                    # Plain code block: keep it as a single code cell.
                    source = [re.sub(r'(^\n*|\n*$)', '', tag.text)]
                else:
                    # Interactive (IDLE-style) session: strip the >>> / ... prompts
                    # and drop the interleaved output lines.
                    source = []
                    for line in tag.text.split('\n'):
                        if re.search(r'^(>|\.){3}', line):
                            if re.search(r'^(>|\.){3}\s*$', line):
                                continue
                            source.append(re.sub(r'^(>|\.){3} ', '', line))
                        else:
                            if source:
                                cell = deepcopy(cell_code)
                                cell['source'].append(re.sub(r'(^\n*|\n*$)', '', '\n'.join(source)))
                                cells.append(cell)
                                source = []
                            else:
                                continue
                if source:
                    cell = deepcopy(cell_code)
                    cell['source'].append('\n'.join(source))
                    cells.append(cell)
        # Strip the '¶' permalink markers that the HTML headings carry.
        for cell in cells:
            for i, text in enumerate(cell['source']):
                cell['source'][i] = text.replace('¶', '')
        return cells


def fetch_all(sep=False):
    """Convert the chapter pages linked from the index (slice [2:15] of the TOC links)."""
    content = requests.get('https://python3-cookbook.readthedocs.io/zh_CN/latest/').content
    soup = BeautifulSoup(content, 'html.parser')
    for x in soup.find_all('a', class_='reference internal', href=re.compile(r'chapters/p\d+'))[2:15]:
        ch = Chapter('https://python3-cookbook.readthedocs.io/zh_CN/latest/' + x['href'])
        ch.fetch_list()
        ch.fetch_sections(sep=sep)


if __name__ == '__main__':
    # ch = Chapter('https://python3-cookbook.readthedocs.io/zh_CN/latest/chapters/p01_data_structures_algorithms.html')
    # ch.fetch_list()
    # ch.fetch_sections()
    fetch_all(sep=True)
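
As a rough usage sketch (not part of the commit), the script can also be driven for a single chapter instead of the whole book, mirroring the example left commented out under __main__. The module name fetch_cookbook is assumed to match the file path above and to be importable from the working directory:

    # Minimal sketch, assuming the script is importable as fetch_cookbook.
    # Fetches one chapter and, because sep=True, writes per-section notebooks
    # under ipynb/<chapter_title>/ as well as the combined chapter notebook.
    from fetch_cookbook import Chapter

    ch = Chapter('https://python3-cookbook.readthedocs.io/zh_CN/latest/chapters/p01_data_structures_algorithms.html')
    ch.fetch_list()
    ch.fetch_sections(sep=True)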
