Commit 60b2e8e

update icon and today top news with ai
1 parent 869eb95 commit 60b2e8e

11 files changed (+390 / -128 lines)

app.py (+113 / -43)
@@ -15,6 +15,8 @@
 from common import *
 from pydantic import BaseModel
 from json_repair import repair_json
+from parse_detail import parse_detail
+from feedgen.feed import FeedGenerator
 app = FastAPI()
 app.add_middleware(
     CORSMiddleware,
@@ -76,38 +78,22 @@ async def post_feedback(feedback: Feedback):
 async def get_cards():
     data = await redis_client.get("card_table")
     return {"code": 200, "msg": "success", "data": json.loads(data)}
-@app.get("/todayTopNews")
-async def getTodayTopNews():
-    todayTopNewsData = await redis_client.get("todayTopNews")
-    if todayTopNewsData:
-        return {"code": 200, "msg": "success", "data": json.loads(todayTopNewsData)}
-    else:
-        mongoData = await get_data("hot")
-        data = mongoData['data']
-        filtered_sites = [site for site in data if site["name"] in news_sites]
-        async with httpx.AsyncClient(timeout=httpx.Timeout(60.0)) as client:  # increase the timeout
+async def chatWithModel(messages, check_list=True):
+    err = 10
+    while err > 0:
+        async with httpx.AsyncClient(timeout=httpx.Timeout(360.0)) as client:
             try:
                 response = await client.post(
                     api_url,
                     headers=api_headers,
                     json={
                         "model": "gpt-4o",
-                        "messages": [
-                            {
-                                "role": "system",
-                                "content": "You are a news expert, familiar with all kinds of news-writing techniques and well versed in global current affairs"
-                            },
-                            {
-                                "role": "user",
-                                "content": "From the data below, select the 10 items you think I most need to know, and return JSON data in the format [{hot_lable:'',hot_url:'',hot_value:''}]\ndata:" + json.dumps(filtered_sites)
-                            }
-                        ],
+                        "messages": messages,
                         "stream": True,
                         "temperature": 0,
                         "response_format": {"type": "json_object"}
                     }
                 )
-
                 text = ""
                 try:
                     async for line in response.aiter_lines():
@@ -117,28 +103,104 @@ async def getTodayTopNews():
                         text += chunk
                 except httpx.ReadTimeout:
                     print("Stream reading timed out, using partial response")
-
+                    err -= 1
+                    continue
                 if not text:
-                    return {"code": 500, "msg": "Failed to get response from API", "data": []}
-
-                todayTopNewsData = json.loads(repair_json(text))
-                if "messages" in todayTopNewsData:
-                    todayTopNewsData = todayTopNewsData["messages"]
-                    if len(todayTopNewsData) == 1:
-                        todayTopNewsData = todayTopNewsData[0]
-                        if "content" in todayTopNewsData:
-                            todayTopNewsData = todayTopNewsData['content']
-                # Cache the result in Redis
-                await redis_client.setex("todayTopNews", 3600, json.dumps(todayTopNewsData))
-
-                return {"code": 200, "msg": "success", "data": todayTopNewsData}
-
-            except httpx.RequestError as e:
-                print(f"API request failed: {str(e)}")
-                return {"code": 500, "msg": f"API request failed: {str(e)}", "data": []}
-            except json.JSONDecodeError as e:
-                print(f"Failed to parse API response: {str(e)}")
-                return {"code": 500, "msg": f"Failed to parse API response: {str(e)}", "data": []}
+                    err -= 1
+                    continue
+                if check_list and "hot_value" not in text:
+                    err -= 1
+                    continue
+                if not check_list and "hot_tag" not in text:
+                    err -= 1
+                    continue
+                return text
+            except Exception as e:
+                print(e)
+                err -= 1
+    return ""
+@app.get("/todayTopNews")
+async def getTodayTopNews():
+    todayTopNewsData = await redis_client.get("todayTopNews")
+    if todayTopNewsData:
+        return {"code": 200, "msg": "success", "data": json.loads(todayTopNewsData)}
+    else:
+        mongoData = await get_data("hot")
+        data = mongoData['data']
+        filtered_sites = [site for site in data if site["name"] in news_sites]
+        try:
+            text = await chatWithModel([
+                {
+                    "role": "system",
+                    "content": "You are a news expert, familiar with all kinds of news-writing techniques and well versed in global current affairs"
+                },
+                {
+                    "role": "user",
+                    "content": "From the data below, select the 10 items you think I most need to know, and return JSON data in the format {'hot_topics': [{hot_lable:'',hot_url:'',hot_value:''}]}\ndata:" + json.dumps(filtered_sites)
+                }
+            ])
+            todayTopNewsData = json.loads(repair_json(text))
+            if "messages" in todayTopNewsData:
+                todayTopNewsData = todayTopNewsData["messages"]
+                if len(todayTopNewsData) == 1:
+                    todayTopNewsData = todayTopNewsData[0]
+                    if "content" in todayTopNewsData:
+                        todayTopNewsData = todayTopNewsData['content']
+            if "hot_topics" in todayTopNewsData:
+                todayTopNewsData = todayTopNewsData['hot_topics']
+            needKnows = await parse_detail(todayTopNewsData)
+            summarizes = []
+            for needKnow in needKnows:
+                err = 3
+                while err > 0:
+                    try:
+                        summarize = await chatWithModel([
+                            {
+                                "role": "system",
+                                "content": "You are a news expert, familiar with all kinds of news-writing techniques, well versed in global current affairs, and highly experienced at summarizing content"
+                            },
+                            {
+                                "role": "user",
+                                "content": "Write an efficient summary of roughly 300 characters of the content field in the data below, to be used as the value of hot_content, and add a tag; return JSON in the format {hot_lable:'',hot_url:'',hot_value:'',hot_content:'',hot_tag:''}\ndata:" + json.dumps(needKnow)
+                            }
+                        ], False)
+                        summarize = json.loads(repair_json(summarize))
+                        if "messages" in summarize:
+                            summarize = summarize["messages"]
+                            if len(summarize) == 1:
+                                summarize = summarize[0]
+                                if "content" in summarize:
+                                    summarize = summarize['content']
+                        del needKnow['content']
+                        needKnow['hot_content'] = summarize['hot_content']
+                        needKnow['hot_tag'] = summarize['hot_tag']
+                        summarizes.append(needKnow)
+                        break
+                    except Exception as e:
+                        print(e)
+                        err -= 1
+            await redis_client.setex("todayTopNews", 3600, json.dumps(summarizes))
+            fg = FeedGenerator()
+            fg.title('todayTopNewsWithAI')
+            fg.link(href='https://www.hotday.uk')
+            fg.description('Today top news with AI')
+            for item in summarizes:
+                fe = fg.add_entry()
+                fe.title(item.get('title', item['hot_lable']))
+                fe.link(href=item.get('url', item['hot_url']))
+                fe.description(item.get('description', item['hot_content']))
+            fg.rss_file('rss_feed_today_top_news.xml')
+            return {"code": 200, "msg": "success", "data": summarizes}
+
+        except httpx.RequestError as e:
+            print(f"API request failed: {str(e)}")
+            return {"code": 500, "msg": f"API request failed: {str(e)}", "data": []}
+        except json.JSONDecodeError as e:
+            print(f"Failed to parse API response: {str(e)}")
+            return {"code": 500, "msg": f"Failed to parse API response: {str(e)}", "data": []}
+        except Exception as e:
+            print(f"Some error happened: {str(e)}")
+            return {"code": 500, "msg": f"Some error happened: {str(e)}", "data": []}
 
 @app.get("/rank/{item_id}")
 async def get_data(item_id: str):
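
The chatWithModel helper added above retries up to ten times and accepts a reply only if it contains the marker field it expects ("hot_value" when requesting the top-news list, "hot_tag" when requesting a summary). A stripped-down sketch of that retry-and-validate pattern, with a made-up fake_model standing in for the streamed gpt-4o call:

# Minimal sketch of the retry-and-validate loop in chatWithModel;
# fake_model is a made-up stand-in for the real streamed API call.
import asyncio

async def fake_model(messages):
    return '{"hot_topics": [{"hot_lable": "x", "hot_url": "u", "hot_value": "1"}]}'

async def chat_with_retries(messages, check_list=True, attempts=10):
    while attempts > 0:
        try:
            text = await fake_model(messages)
            marker = "hot_value" if check_list else "hot_tag"
            if not text or marker not in text:
                attempts -= 1  # empty or malformed reply: burn an attempt and retry
                continue
            return text
        except Exception as e:
            print(e)
            attempts -= 1
    return ""  # all attempts exhausted; the caller sees an empty string

print(asyncio.run(chat_with_retries([])))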
@@ -261,7 +323,15 @@ async def get_data(item_id: str):
     except Exception as e:
         # Log or handle the Redis error
         print(f"Redis setex error: {e}")
-
+    fg = FeedGenerator()
+    fg.title('todayTopNewsWithAI')
+    fg.link(href='https://www.hotday.uk')
+    fg.description('Today top news with AI')
+    for item in data:
+        fe = fg.add_entry()
+        fe.title(item.get('title', item['hot_lable']))
+        fe.link(href=item.get('url', item['hot_url']))
+    fg.rss_file('rss_feed.xml')
     return {
         "code": 200,
         "msg": "success",

config.py.example (-4)
@@ -8,12 +8,8 @@ REDIS_PASSWORD = None
 news_sites = [
     "澎湃新闻",
     "36氪",
-    "白鲸出海",
-    "虎嗅热文",
     "IT之家热榜",
-    "腾讯新闻热点榜",
     "少数派热榜",
-    "头条热榜",
     "华尔街见闻"
 ]
 EMAIL = {
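
The news_sites list acts as a whitelist: getTodayTopNews in app.py keeps only the sources named here before building the model prompt, so this commit also stops feeding the four removed boards to the model. A tiny sketch of that filter (the sample records are made up):

# Sketch of the whitelist filter from app.py; the data records are made up.
news_sites = ["澎湃新闻", "36氪", "IT之家热榜", "少数派热榜", "华尔街见闻"]

data = [
    {"name": "澎湃新闻", "data": ["item1", "item2"]},
    {"name": "腾讯新闻热点榜", "data": ["item3"]},  # dropped from the whitelist here
]

filtered_sites = [site for site in data if site["name"] in news_sites]
print([site["name"] for site in filtered_sites])  # ['澎湃新闻']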

parse_detail.py (+64 / -27)
@@ -3,15 +3,31 @@
 import asyncio
 from Crypto.Cipher import AES
 from Crypto.Util.Padding import unpad
-import pyquery
 import base64
 import re
+import markdownify
+from bs4 import BeautifulSoup
+
+async def remove_img_tags(html_content):
+    if not html_content:
+        return html_content
+    soup = BeautifulSoup(html_content, 'html.parser')
+    for img in soup.find_all('img'):
+        img.decompose()
+    return str(soup)
+
 async def parse_detail(needKnowList):
     for needKnow in needKnowList:
         if "thepaper.cn" in needKnow['hot_url']:
             needKnow = await parse_pengpai(needKnow)
         elif "36kr.com" in needKnow['hot_url']:
             needKnow = await parse_36kr(needKnow)
+        elif "ithome.com" in needKnow['hot_url']:
+            needKnow = await parse_ithome(needKnow)
+        elif "sspai.com" in needKnow['hot_url']:
+            needKnow = await parse_sspai(needKnow)
+        elif "wallstreetcn.com" in needKnow['hot_url']:
+            needKnow = await parse_awatmt(needKnow)
     return needKnowList
 
 
@@ -24,8 +40,17 @@ async def fetch(url):
 async def parse_pengpai(needKnow):
     url = needKnow['hot_url']
     res = await fetch(url)
-    pq = pyquery.PyQuery(res)
-    detail = pq("h1~div").text()
+    soup = BeautifulSoup(res, 'html.parser')
+    try:
+        detail = soup.select_one("div[class^='index_cententWrap']")
+        if not detail:
+            detail = soup.select_one("div[class^='header_videoWrap'] ~ div")
+        if detail:
+            detail = str(detail)
+    except:
+        return needKnow
+    detail = await remove_img_tags(detail)
+    detail = markdownify.markdownify(detail).strip()
     needKnow['content'] = detail
     return needKnow
 
@@ -41,31 +66,43 @@ async def parse_36kr(needKnow):
     decrypted_bytes = unpad(decrypted_padded, AES.block_size)
     decrypted_text = decrypted_bytes.decode('utf-8')
     state_dict = json.loads(decrypted_text)
-    needKnow['content'] = state_dict['articleDetail']['articleDetailData']['data']['widgetContent']
+    detail = state_dict['articleDetail']['articleDetailData']['data']['widgetContent']
+    detail = await remove_img_tags(detail)
+    needKnow['content'] = markdownify.markdownify(detail).strip()
     return needKnow
 
-async def main():
-    # test data
-    test_data = [
-        {
-            'hot_url': 'https://www.36kr.com/p/3061699221143300',
-            'title': 'Test Article 1'
-        },
-        {
-            'hot_url': 'https://www.36kr.com/p/3062989680862336',
-            'title': 'Test Article 2'
-        }
-    ]
-
-
-    results = await parse_detail(test_data)
-
-
-    for result in results:
-        print(f"Title: {result['title']}")
-        print(f"Content: {result['content'][:100]}...")
-        print("-" * 50)
+async def parse_ithome(needKnow):
+    url = needKnow['hot_url']
+    res = await fetch(url)
+    soup = BeautifulSoup(res, 'html.parser')
+    detail = soup.select_one(".news-content")
+    if detail:
+        detail = str(detail)
+    detail = await remove_img_tags(detail)
+    detail = markdownify.markdownify(detail).strip()
+    needKnow['content'] = detail
+    return needKnow
 
+async def parse_sspai(needKnow):
+    url = needKnow['hot_url']
+    res = await fetch(url)
+    soup = BeautifulSoup(res, 'html.parser')
+    detail = soup.select_one("div.content")
+    if detail:
+        detail = str(detail)
+    detail = await remove_img_tags(detail)
+    detail = markdownify.markdownify(detail).strip()
+    needKnow['content'] = detail
+    return needKnow
 
-if __name__ == "__main__":
-    asyncio.run(main())
+async def parse_awatmt(needKnow):
+    url = needKnow['hot_url']
+    artile_id = url.split("?")[0].split("/")[-1]
+    url = f"https://api-one-wscn.awtmt.com/apiv1/content/articles/{artile_id}?extract=0"
+    res = await fetch(url)
+    res_json = json.loads(res)
+    detail = res_json['data']['content']
+    detail = await remove_img_tags(detail)
+    detail = markdownify.markdownify(detail).strip()
+    needKnow['content'] = detail
+    return needKnow
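
Each new parser ends with the same cleanup pipeline: strip <img> tags with BeautifulSoup, then convert the remaining HTML to Markdown with markdownify. A self-contained sketch of that pipeline (the sample HTML is made up):

# Sketch of the shared cleanup pipeline: drop <img> tags, then HTML -> Markdown.
import asyncio
import markdownify
from bs4 import BeautifulSoup

async def remove_img_tags(html_content):
    if not html_content:
        return html_content
    soup = BeautifulSoup(html_content, 'html.parser')
    for img in soup.find_all('img'):
        img.decompose()  # delete the tag in place
    return str(soup)

async def demo():
    html = "<div><h1>Title</h1><img src='x.png'><p>Body text.</p></div>"
    cleaned = await remove_img_tags(html)  # the <img> element is gone
    print(markdownify.markdownify(cleaned).strip())  # Markdown without the image

asyncio.run(demo())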
