Commit 60b2e8e

update icon and today top news with ai
1 parent 869eb95 commit 60b2e8e

11 files changed (+390 / -128 lines)

app.py (+113 / -43)
@@ -15,6 +15,8 @@
 from common import *
 from pydantic import BaseModel
 from json_repair import repair_json
+from parse_detail import parse_detail
+from feedgen.feed import FeedGenerator
 app = FastAPI()
 app.add_middleware(
     CORSMiddleware,
@@ -76,38 +78,22 @@ async def post_feedback(feedback: Feedback):
 async def get_cards():
     data = await redis_client.get("card_table")
     return {"code": 200, "msg": "success", "data": json.loads(data)}
-@app.get("/todayTopNews")
-async def getTodayTopNews():
-    todayTopNewsData = await redis_client.get("todayTopNews")
-    if todayTopNewsData:
-        return {"code": 200, "msg": "success", "data": json.loads(todayTopNewsData)}
-    else:
-        mongoData = await get_data("hot")
-        data = mongoData['data']
-        filtered_sites = [site for site in data if site["name"] in news_sites]
-        async with httpx.AsyncClient(timeout=httpx.Timeout(60.0)) as client:  # increase the timeout
+async def chatWithModel(messages, check_list=True):
+    err = 10
+    while err > 0:
+        async with httpx.AsyncClient(timeout=httpx.Timeout(360.0)) as client:
             try:
                 response = await client.post(
                     api_url,
                     headers=api_headers,
                     json={
                         "model": "gpt-4o",
-                        "messages": [
-                            {
-                                "role": "system",
-                                "content": "You are a news expert, familiar with all kinds of news-writing techniques and well versed in global current affairs"
-                            },
-                            {
-                                "role": "user",
-                                "content": "From the data below, select the 10 items you think I most need to know, and return JSON data in the format [{hot_lable:'',hot_url:'',hot_value:''}]\ndata:" + json.dumps(filtered_sites)
-                            }
-                        ],
+                        "messages": messages,
                         "stream": True,
                         "temperature": 0,
                         "response_format": {"type": "json_object"}
                     }
                 )
-
                 text = ""
                 try:
                     async for line in response.aiter_lines():
@@ -117,28 +103,104 @@ async def getTodayTopNews():
                         text += chunk
                 except httpx.ReadTimeout:
                     print("Stream reading timed out, using partial response")
-
+                    err -= 1
+                    continue
                 if not text:
-                    return {"code": 500, "msg": "Failed to get response from API", "data": []}
-
-                todayTopNewsData = json.loads(repair_json(text))
-                if "messages" in todayTopNewsData:
-                    todayTopNewsData = todayTopNewsData["messages"]
-                    if len(todayTopNewsData) == 1:
-                        todayTopNewsData = todayTopNewsData[0]
-                        if "content" in todayTopNewsData:
-                            todayTopNewsData = todayTopNewsData['content']
-                # Cache the result in Redis
-                await redis_client.setex("todayTopNews", 3600, json.dumps(todayTopNewsData))
-
-                return {"code": 200, "msg": "success", "data": todayTopNewsData}
-
-            except httpx.RequestError as e:
-                print(f"API request failed: {str(e)}")
-                return {"code": 500, "msg": f"API request failed: {str(e)}", "data": []}
-            except json.JSONDecodeError as e:
-                print(f"Failed to parse API response: {str(e)}")
-                return {"code": 500, "msg": f"Failed to parse API response: {str(e)}", "data": []}
+                    err -= 1
+                    continue
+                if check_list and "hot_value" not in text:
+                    err -= 1
+                    continue
+                if not check_list and "hot_tag" not in text:
+                    err -= 1
+                    continue
+                return text
+            except Exception as e:
+                print(e)
+                err -= 1
+    return ""
+@app.get("/todayTopNews")
+async def getTodayTopNews():
+    todayTopNewsData = await redis_client.get("todayTopNews")
+    if todayTopNewsData:
+        return {"code": 200, "msg": "success", "data": json.loads(todayTopNewsData)}
+    else:
+        mongoData = await get_data("hot")
+        data = mongoData['data']
+        filtered_sites = [site for site in data if site["name"] in news_sites]
+        try:
+            text = await chatWithModel([
+                {
+                    "role": "system",
+                    "content": "You are a news expert, familiar with all kinds of news-writing techniques and well versed in global current affairs"
+                },
+                {
+                    "role": "user",
+                    "content": "From the data below, select the 10 items you think I most need to know, and return JSON data in the format {'hot_topics': [{hot_lable:'',hot_url:'',hot_value:''}]}\ndata:" + json.dumps(filtered_sites)
+                }
+            ])
+            todayTopNewsData = json.loads(repair_json(text))
+            if "messages" in todayTopNewsData:
+                todayTopNewsData = todayTopNewsData["messages"]
+                if len(todayTopNewsData) == 1:
+                    todayTopNewsData = todayTopNewsData[0]
+                    if "content" in todayTopNewsData:
+                        todayTopNewsData = todayTopNewsData['content']
+            if "hot_topics" in todayTopNewsData:
+                todayTopNewsData = todayTopNewsData['hot_topics']
+            needKnows = await parse_detail(todayTopNewsData)
+            summarizes = []
+            for needKnow in needKnows:
+                err = 3
+                while err > 0:
+                    try:
+                        summarize = await chatWithModel([
+                            {
+                                "role": "system",
+                                "content": "You are a news expert, familiar with all kinds of news-writing techniques, well versed in global current affairs, and highly experienced at summarizing content"
+                            },
+                            {
+                                "role": "user",
+                                "content": "Write an efficient summary of roughly 300 characters of the content field in the data below, to be used as the value of hot_content, and add a tag; return JSON in the format {hot_lable:'',hot_url:'',hot_value:'',hot_content:'',hot_tag:''}\ndata:" + json.dumps(needKnow)
+                            }
+                        ], False)
+                        summarize = json.loads(repair_json(summarize))
+                        if "messages" in summarize:
+                            summarize = summarize["messages"]
+                            if len(summarize) == 1:
+                                summarize = summarize[0]
+                                if "content" in summarize:
+                                    summarize = summarize['content']
+                        del needKnow['content']
+                        needKnow['hot_content'] = summarize['hot_content']
+                        needKnow['hot_tag'] = summarize['hot_tag']
+                        summarizes.append(needKnow)
+                        break
+                    except Exception as e:
+                        print(e)
+                        err -= 1
+            await redis_client.setex("todayTopNews", 3600, json.dumps(summarizes))
+            fg = FeedGenerator()
+            fg.title('todayTopNewsWithAI')
+            fg.link(href='https://www.hotday.uk')
+            fg.description('Today top news with AI')
+            for item in summarizes:
+                fe = fg.add_entry()
+                fe.title(item.get('title', item['hot_lable']))
+                fe.link(href=item.get('url', item['hot_url']))
+                fe.description(item.get('description', item['hot_content']))
+            fg.rss_file('rss_feed_today_top_news.xml')
+            return {"code": 200, "msg": "success", "data": summarizes}
+
+        except httpx.RequestError as e:
+            print(f"API request failed: {str(e)}")
+            return {"code": 500, "msg": f"API request failed: {str(e)}", "data": []}
+        except json.JSONDecodeError as e:
+            print(f"Failed to parse API response: {str(e)}")
+            return {"code": 500, "msg": f"Failed to parse API response: {str(e)}", "data": []}
+        except Exception as e:
+            print(f"Some error happened: {str(e)}")
+            return {"code": 500, "msg": f"Some error happened: {str(e)}", "data": []}
 
 @app.get("/rank/{item_id}")
 async def get_data(item_id: str):
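
The chatWithModel helper added above retries up to ten times and accepts a reply only if it contains the marker field it expects ("hot_value" when requesting the top-news list, "hot_tag" when requesting a summary). A stripped-down sketch of that retry-and-validate pattern, with a made-up fake_model standing in for the streamed gpt-4o call:

# Minimal sketch of the retry-and-validate loop in chatWithModel;
# fake_model is a made-up stand-in for the real streamed API call.
import asyncio

async def fake_model(messages):
    return '{"hot_topics": [{"hot_lable": "x", "hot_url": "u", "hot_value": "1"}]}'

async def chat_with_retries(messages, check_list=True, attempts=10):
    while attempts > 0:
        try:
            text = await fake_model(messages)
            marker = "hot_value" if check_list else "hot_tag"
            if not text or marker not in text:
                attempts -= 1  # empty or malformed reply: burn an attempt and retry
                continue
            return text
        except Exception as e:
            print(e)
            attempts -= 1
    return ""  # all attempts exhausted; the caller sees an empty string

print(asyncio.run(chat_with_retries([])))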
@@ -261,7 +323,15 @@ async def get_data(item_id: str):
     except Exception as e:
         # Log or handle the Redis error
         print(f"Redis setex error: {e}")
-
+    fg = FeedGenerator()
+    fg.title('todayTopNewsWithAI')
+    fg.link(href='https://www.hotday.uk')
+    fg.description('Today top news with AI')
+    for item in data:
+        fe = fg.add_entry()
+        fe.title(item.get('title', item['hot_lable']))
+        fe.link(href=item.get('url', item['hot_url']))
+    fg.rss_file('rss_feed.xml')
     return {
         "code": 200,
         "msg": "success",

config.py.example (-4)
@@ -8,12 +8,8 @@ REDIS_PASSWORD = None
 news_sites = [
     "澎湃新闻",
     "36氪",
-    "白鲸出海",
-    "虎嗅热文",
     "IT之家热榜",
-    "腾讯新闻热点榜",
     "少数派热榜",
-    "头条热榜",
     "华尔街见闻"
 ]
 EMAIL = {
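
The news_sites list acts as a whitelist: getTodayTopNews in app.py keeps only the sources named here before building the model prompt, so this commit also stops feeding the four removed boards to the model. A tiny sketch of that filter (the sample records are made up):

# Sketch of the whitelist filter from app.py; the data records are made up.
news_sites = ["澎湃新闻", "36氪", "IT之家热榜", "少数派热榜", "华尔街见闻"]

data = [
    {"name": "澎湃新闻", "data": ["item1", "item2"]},
    {"name": "腾讯新闻热点榜", "data": ["item3"]},  # dropped from the whitelist here
]

filtered_sites = [site for site in data if site["name"] in news_sites]
print([site["name"] for site in filtered_sites])  # ['澎湃新闻']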

parse_detail.py (+64 / -27)
@@ -3,15 +3,31 @@
 import asyncio
 from Crypto.Cipher import AES
 from Crypto.Util.Padding import unpad
-import pyquery
 import base64
 import re
+import markdownify
+from bs4 import BeautifulSoup
+
+async def remove_img_tags(html_content):
+    if not html_content:
+        return html_content
+    soup = BeautifulSoup(html_content, 'html.parser')
+    for img in soup.find_all('img'):
+        img.decompose()
+    return str(soup)
+
 async def parse_detail(needKnowList):
     for needKnow in needKnowList:
         if "thepaper.cn" in needKnow['hot_url']:
             needKnow = await parse_pengpai(needKnow)
         elif "36kr.com" in needKnow['hot_url']:
             needKnow = await parse_36kr(needKnow)
+        elif "ithome.com" in needKnow['hot_url']:
+            needKnow = await parse_ithome(needKnow)
+        elif "sspai.com" in needKnow['hot_url']:
+            needKnow = await parse_sspai(needKnow)
+        elif "wallstreetcn.com" in needKnow['hot_url']:
+            needKnow = await parse_awatmt(needKnow)
     return needKnowList
 
 
@@ -24,8 +40,17 @@ async def fetch(url):
 async def parse_pengpai(needKnow):
     url = needKnow['hot_url']
     res = await fetch(url)
-    pq = pyquery.PyQuery(res)
-    detail = pq("h1~div").text()
+    soup = BeautifulSoup(res, 'html.parser')
+    try:
+        detail = soup.select_one("div[class^='index_cententWrap']")
+        if not detail:
+            detail = soup.select_one("div[class^='header_videoWrap'] ~ div")
+        if detail:
+            detail = str(detail)
+    except:
+        return needKnow
+    detail = await remove_img_tags(detail)
+    detail = markdownify.markdownify(detail).strip()
     needKnow['content'] = detail
     return needKnow
 
@@ -41,31 +66,43 @@ async def parse_36kr(needKnow):
     decrypted_bytes = unpad(decrypted_padded, AES.block_size)
     decrypted_text = decrypted_bytes.decode('utf-8')
     state_dict = json.loads(decrypted_text)
-    needKnow['content'] = state_dict['articleDetail']['articleDetailData']['data']['widgetContent']
+    detail = state_dict['articleDetail']['articleDetailData']['data']['widgetContent']
+    detail = await remove_img_tags(detail)
+    needKnow['content'] = markdownify.markdownify(detail).strip()
     return needKnow
 
-async def main():
-    # test data
-    test_data = [
-        {
-            'hot_url': 'https://www.36kr.com/p/3061699221143300',
-            'title': 'Test Article 1'
-        },
-        {
-            'hot_url': 'https://www.36kr.com/p/3062989680862336',
-            'title': 'Test Article 2'
-        }
-    ]
-
-
-    results = await parse_detail(test_data)
-
-
-    for result in results:
-        print(f"Title: {result['title']}")
-        print(f"Content: {result['content'][:100]}...")
-        print("-" * 50)
+async def parse_ithome(needKnow):
+    url = needKnow['hot_url']
+    res = await fetch(url)
+    soup = BeautifulSoup(res, 'html.parser')
+    detail = soup.select_one(".news-content")
+    if detail:
+        detail = str(detail)
+    detail = await remove_img_tags(detail)
+    detail = markdownify.markdownify(detail).strip()
+    needKnow['content'] = detail
+    return needKnow
 
+async def parse_sspai(needKnow):
+    url = needKnow['hot_url']
+    res = await fetch(url)
+    soup = BeautifulSoup(res, 'html.parser')
+    detail = soup.select_one("div.content")
+    if detail:
+        detail = str(detail)
+    detail = await remove_img_tags(detail)
+    detail = markdownify.markdownify(detail).strip()
+    needKnow['content'] = detail
+    return needKnow
 
-if __name__ == "__main__":
-    asyncio.run(main())
+async def parse_awatmt(needKnow):
+    url = needKnow['hot_url']
+    artile_id = url.split("?")[0].split("/")[-1]
+    url = f"https://api-one-wscn.awtmt.com/apiv1/content/articles/{artile_id}?extract=0"
+    res = await fetch(url)
+    res_json = json.loads(res)
+    detail = res_json['data']['content']
+    detail = await remove_img_tags(detail)
+    detail = markdownify.markdownify(detail).strip()
+    needKnow['content'] = detail
+    return needKnow
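
Each new parser ends with the same cleanup pipeline: strip <img> tags with BeautifulSoup, then convert the remaining HTML to Markdown with markdownify. A self-contained sketch of that pipeline (the sample HTML is made up):

# Sketch of the shared cleanup pipeline: drop <img> tags, then HTML -> Markdown.
import asyncio
import markdownify
from bs4 import BeautifulSoup

async def remove_img_tags(html_content):
    if not html_content:
        return html_content
    soup = BeautifulSoup(html_content, 'html.parser')
    for img in soup.find_all('img'):
        img.decompose()  # delete the tag in place
    return str(soup)

async def demo():
    html = "<div><h1>Title</h1><img src='x.png'><p>Body text.</p></div>"
    cleaned = await remove_img_tags(html)  # the <img> element is gone
    print(markdownify.markdownify(cleaned).strip())  # Markdown without the image

asyncio.run(demo())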
