|
16 | 16 | )
|
17 | 17 |
|
18 | 18 | def scrape_huangli():
|
19 |
| - url = "https://www.huangli.com/" |
20 |
| - html_content = requests.get(url).text |
| 19 | + url = "https://www.laohuangli.net/" |
| 20 | + html_content = requests.get(url, proxies={'http': 'http://127.0.0.1:7890','https': 'http://127.0.0.1:7890'}).text |
21 | 21 | doc = pq(html_content)
|
22 | 22 | data = {}
|
23 | 23 |
|
24 |
| - # 解析左侧部分 |
25 |
| - left = doc('.lunars-b .l') |
26 |
| - |
27 |
| - # 提取左侧 Horoscope 信息 |
28 |
| - data['left_horoscopes'] = [] |
29 |
| - for elem in left('.horoscope').items(): |
30 |
| - data['left_horoscopes'].append({ |
31 |
| - 'hit': elem.find('.hit').text(), |
32 |
| - 'gz': elem.find('.gz').text(), |
33 |
| - 'zodiac': elem.find('.zodiac').text(), |
34 |
| - 'nayin': elem.find('.nayin').text() |
35 |
| - }) |
36 |
| - |
37 |
| - # 提取左侧 Solar 图片 |
38 |
| - data['left_solar'] = left('.solar img').attr('src') |
39 |
| - |
40 |
| - # 提取左侧宜忌 |
41 |
| - data['left_yi_ji'] = [a.text() for a in left('.yi-ji').eq(0).find('a').items()] |
42 |
| - |
43 |
| - # 提取左侧吉神宜趋 |
44 |
| - data['left_shen_sha'] = { |
45 |
| - 'title': left('.shen-sha .title span').text(), |
46 |
| - 'content': left('.shen-sha p').text() |
47 |
| - } |
48 |
| - |
49 |
| - # 提取左侧彭祖百忌和相冲 |
50 |
| - pz_chong = {} |
51 |
| - for elem in left('.pz-chong div').items(): |
52 |
| - title = elem.find('.title').text() |
53 |
| - text = elem.find('.text').text() |
54 |
| - pz_chong[title] = text |
55 |
| - data['left_pz_chong'] = pz_chong |
56 |
| - |
57 |
| - # 提取左侧月名、物候、月相 |
58 |
| - yz_wh_yx = [] |
59 |
| - for elem in left('.yz-wh-yx div').items(): |
60 |
| - yz_wh_yx.append({ |
61 |
| - 'title': elem.find('.title').text(), |
62 |
| - 'text': elem.find('.text').text(), |
63 |
| - 'img': elem.find('img').attr('src') |
64 |
| - }) |
65 |
| - data['left_yz_wh_yx'] = yz_wh_yx |
66 |
| - |
67 |
| - # 解析中间部分 |
68 |
| - center = doc('.lunars-b .c') |
69 |
| - center_top = center('.top') |
70 |
| - |
71 |
| - # 提取日期信息 |
72 |
| - data['center_datepicker'] = center_top('.form-data #datetimepicker').val() |
73 |
| - data['center_return_today'] = center_top('.form-a').text() |
74 |
| - |
75 |
| - # 提取今日幸运生肖 |
76 |
| - lucky_zodiac = { |
77 |
| - 'animals': [a.text() for a in center_top('.days-info .list').eq(0).find('.text a').items()], |
78 |
| - 'title': center_top('.days-info .list').eq(0).find('.title').text() |
79 |
| - } |
80 |
| - data['center_lucky_zodiac'] = lucky_zodiac |
81 |
| - |
82 |
| - # 提取今日星座 |
83 |
| - today_constellation = { |
84 |
| - 'constellation': center_top('.days-info .list').eq(1).find('.text').text(), |
85 |
| - 'title': center_top('.days-info .list').eq(1).find('.title').text() |
86 |
| - } |
87 |
| - data['center_today_constellation'] = today_constellation |
88 |
| - |
89 |
| - # 提取日期数字和农历日期 |
90 |
| - data['center_date_number'] = center_top('.days-info .su').text() |
91 |
| - data['center_lunar_date'] = center_top('h4').text() |
92 |
| - |
93 |
| - # 提取二维码链接 |
94 |
| - data['center_qr_codes'] = [] |
95 |
| - for elem in center_top('.down .QRcode').items(): |
96 |
| - data['center_qr_codes'].append({ |
97 |
| - 'type': elem.attr('data-type'), |
98 |
| - 'img_src': elem.find('img').attr('src') |
99 |
| - }) |
100 |
| - |
101 |
| - # 提取注意事项 |
102 |
| - data['center_note'] = { |
103 |
| - 'img': center_top('.note img').attr('src'), |
104 |
| - 'text': center_top('.note span').text() |
105 |
| - } |
106 |
| - |
107 |
| - # 提取底部信息 |
108 |
| - bottom = center('.bottom') |
109 |
| - |
110 |
| - # 提取财神位 |
111 |
| - data['bottom_caishen'] = [] |
112 |
| - for elem in bottom('.lunars-info.shen .list .item').items(): |
113 |
| - data['bottom_caishen'].append({ |
114 |
| - 'title': elem.find('.title').text(), |
115 |
| - 'text': elem.find('.text').text(), |
116 |
| - 'link': elem.attr('href') |
117 |
| - }) |
118 |
| - |
119 |
| - # 提取阴阳贵神 |
120 |
| - data['bottom_yinyang_guishen'] = [] |
121 |
| - for elem in bottom('.lunars-info').eq(1).find('.list .item').items(): |
122 |
| - data['bottom_yinyang_guishen'].append({ |
123 |
| - 'title': elem.find('.title').text(), |
124 |
| - 'text': elem.find('.text').text() |
125 |
| - }) |
126 |
| - |
127 |
| - # 提取空亡所值 |
128 |
| - data['bottom_kongwang_souzhi'] = [] |
129 |
| - for elem in bottom('.lunars-info').eq(2).find('.list .item').items(): |
130 |
| - data['bottom_kongwang_souzhi'].append({ |
131 |
| - 'title': elem.find('.title').text(), |
132 |
| - 'text': elem.find('.text').text() |
133 |
| - }) |
134 |
| - |
135 |
| - # 提取九宫飞星 |
136 |
| - data['bottom_jiugong_feixing'] = [ |
137 |
| - elem.text() for elem in bottom('.lunars-info').eq(3).find('.list .item .text').items() |
| 24 | + # 左侧年份、生肖、五行信息 |
| 25 | + left_divs = doc("td[class='tr-p0'] div") |
| 26 | + data['year_info'] = [ |
| 27 | + { |
| 28 | + "label": pq(span.eq(0)).text(), |
| 29 | + "zodiac": pq(span.eq(1)).text(), |
| 30 | + "element": pq(span.eq(2)).text() |
| 31 | + } |
| 32 | + for div in left_divs |
| 33 | + for span in [pq(div).find("span")] |
138 | 34 | ]
|
139 | 35 |
|
140 |
| - # 解析右侧部分 |
141 |
| - right = doc('.lunars-b .l.r') |
| 36 | + # 提取公历和农历信息 |
| 37 | + calendar_info = doc("td[class='bg-table'] .middle-rowspan") |
| 38 | + data['gregorian_calendar'] = calendar_info.find("p").eq(0).text() |
| 39 | + data['lunar_calendar'] = calendar_info.find("p").eq(1).text() |
142 | 40 |
|
143 |
| - # 提取右侧 Horoscope 信息 |
144 |
| - data['right_horoscopes'] = [] |
145 |
| - for elem in right('.horoscope').items(): |
146 |
| - data['right_horoscopes'].append({ |
147 |
| - 'hit': elem.find('.hit').text(), |
148 |
| - 'gz': elem.find('.gz').text(), |
149 |
| - 'zodiac': elem.find('.zodiac').text() |
150 |
| - }) |
| 41 | + # 提取宜和忌信息 |
| 42 | + good_div = doc(".table-three-div").eq(0) |
| 43 | + bad_div = doc(".table-three-div").eq(1) |
| 44 | + data['good_actions'] = [ |
| 45 | + pq(span).text() for span in good_div.find("span") if pq(span).text() |
| 46 | + ] |
| 47 | + data['bad_actions'] = [ |
| 48 | + pq(span).text() for span in bad_div.find("span") if pq(span).text() |
| 49 | + ] |
151 | 50 |
|
152 |
| - # 提取右侧 Solar 图片 |
153 |
| - data['right_solar'] = right('.solar img').attr('src') |
| 51 | + # 提取彭祖百忌和相冲 |
| 52 | + fourth_row = doc(".table-four-tr") |
| 53 | + data['pengzu_bai_ji'] = fourth_row.find(".col-td2").eq(0).find(".icon-none").text() |
| 54 | + data['xiang_chong'] = fourth_row.find(".col-td2").eq(1).find(".icon-none").text() |
154 | 55 |
|
155 |
| - # 提取右侧宜忌 |
156 |
| - data['right_yi_ji'] = [a.text() for a in right('.yi-ji a').items()] |
157 |
| - data['left_yi_ji'] = [a for a in data['left_yi_ji'] if a not in data['right_yi_ji']] |
158 |
| - # 提取右侧凶煞宜忌 |
159 |
| - data['right_shen_sha'] = { |
160 |
| - 'title': right('.shen-sha .title span').text(), |
161 |
| - 'content': right('.shen-sha p').text() |
162 |
| - } |
| 56 | + # 提取胎神信息 |
| 57 | + data['tai_shen'] = [ |
| 58 | + pq(span).text() |
| 59 | + for span in fourth_row.find(".col-td2").eq(2).find(".icon-none") |
| 60 | + ] |
163 | 61 |
|
164 |
| - # 提取右侧本月胎神和今日胎神 |
165 |
| - pz_chong_right = {} |
166 |
| - for elem in right('.pz-chong div').items(): |
167 |
| - title = elem.find('.title').text() |
168 |
| - text = elem.find('.text').text() |
169 |
| - pz_chong_right[title] = text |
170 |
| - data['right_pz_chong'] = pz_chong_right |
171 |
| - |
172 |
| - # 提取右侧岁煞、六耀、日禄 |
173 |
| - yz_wh_yx_right = [] |
174 |
| - for elem in right('.yz-wh-yx div').items(): |
175 |
| - yz_wh_yx_right.append({ |
176 |
| - 'title': elem.find('.title').text(), |
177 |
| - 'text': elem.find('.text').text(), |
178 |
| - 'img': elem.find('img').attr('src') |
179 |
| - }) |
180 |
| - data['right_yz_wh_yx'] = yz_wh_yx_right |
| 62 | + # 提取吉神和凶煞 |
| 63 | + fifth_row = doc(".table-five-div") |
| 64 | + data['ji_shen'] = fifth_row.eq(0).find("p").text().replace(",", "").split() |
| 65 | + data['xiong_sha'] = fifth_row.eq(2).find("p").text().replace(",", "").split() |
| 66 | + |
| 67 | + # 提取表格中的月份、物候等信息 |
| 68 | + month_table = doc("table") |
| 69 | + data['month_table'] = [ |
| 70 | + { |
| 71 | + pq(td.eq(0)).text(): pq(td.eq(1)).text() |
| 72 | + for td in pq(tr).find("td").items() |
| 73 | + } |
| 74 | + for tr in month_table.find("tr").items() |
| 75 | + ] |
181 | 76 |
|
182 | 77 | # 保存到 Redis
|
183 | 78 | json_result = json.dumps(data, ensure_ascii=False, indent=4)
|
|
0 commit comments