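feat(nytimes): update news_bot.py (re-enable TLS verification, add request timeout, dedupe and cap extracted links, emit Markdown links)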

lzwjava · lzwjava · commit c213d1c4e7a6 · 2025-06-06T00:22:17.000+08:00
diff --git a/scripts/nytimes/news_bot.py b/scripts/nytimes/news_bot.py
@@ -2,9 +2,9 @@
 from bs4 import BeautifulSoup
 import os
 from dotenv import load_dotenv
-import ssl
 import datetime
 import sys
+import re
 
 load_dotenv()
 
@@ -34,9 +34,8 @@ def send_telegram_message(message):
 def fetch_html_content(url):
     """Fetches the HTML content of a given URL."""
     try:
-        context = ssl._create_unverified_context()
         print(f"Fetching HTML content from: {url}")
-        response = requests.get(url, verify=False)
+        response = requests.get(url, timeout=15)
         response.raise_for_status()
         print(f"Successfully fetched HTML content from: {url}")
         return response.text
@@ -48,45 +47,64 @@ def extract_hacker_news_links(html):
     """Extracts links from Hacker News."""
     soup = BeautifulSoup(html, 'html.parser')
     links = []
+    seen = set()
     for item in soup.select('.titleline > a'):
         url = item['href']
         title = item.text.strip()
         if url.startswith('item?id='):  # Handle internal HN links
             url = f"https://news.ycombinator.com/{url}"
-        links.append({'url': url, 'text': title})
+        # Avoid duplicates and empty titles
+        if url not in seen and title:
+            links.append({'url': url, 'text': title})
+            seen.add(url)
+        if len(links) >= 5:
+            break
     print(f"Extracted {len(links)} links from Hacker News.")
-    return links[:5]  # Limit to top 5 stories
+    return links
 
 def extract_github_trending(html):
     """Extracts trending repositories from GitHub."""
     soup = BeautifulSoup(html, 'html.parser')
     links = []
-    for repo in soup.select('article.Box-row h1 a'):
+    for repo in soup.select('article.Box-row h2 a'):
         url = f"https://github.com{repo['href']}"
-        title = repo.text.strip().replace('\n', '').replace(' ', '')
-        links.append({'url': url, 'text': title})
+        # Clean up repo name: collapse each run of whitespace (including newlines) into a single space, then trim
+        title = re.sub(r'\s+', ' ', repo.text).strip()
+        if title and url:
+            links.append({'url': url, 'text': title})
+        if len(links) >= 5:
+            break
     print(f"Extracted {len(links)} trending repositories from GitHub.")
-    return links[:5]  # Limit to top 5 repos
+    return links
 
 def extract_nytimes_links(html):
     """Extracts links from the main page of cn.nytimes.com."""
     soup = BeautifulSoup(html, 'html.parser')
     links = []
+    seen = set()
+    # NYTimes mobile Chinese site: scan every <a href> on the page and keep the first 5 unique links under m.cn.nytimes.com
     for a in soup.find_all('a', href=True):
         url = a['href']
-        if url.startswith('https://m.cn.nytimes.com/'):
-            links.append({
-                'url': url,
-                'text': a.text.strip()
-            })
+        text = a.get_text(strip=True)
+        # Only keep not-yet-seen links whose URL starts with https://m.cn.nytimes.com/ and whose text is non-empty
+        if url.startswith('https://m.cn.nytimes.com/') and text and url not in seen:
+            links.append({'url': url, 'text': text})
+            seen.add(url)
+        if len(links) >= 5:
+            break
     print(f"Extracted {len(links)} links from NYTimes.")
-    return links[:5]  # Limit to top 5 articles
+    return links
 
 def generate_markdown_report(articles, source_name):
     """Generates a Markdown report for the given articles."""
     markdown = f"### {source_name}\n\n"
+    if not articles:
+        markdown += "_No items found._\n\n"
+        return markdown
     for article in articles:
-        markdown += f"- {article['text']} ({article['url']})\n"
+        # Escape parentheses in text to avoid Markdown link issues
+        safe_text = article['text'].replace('(', '\\(').replace(')', '\\)')
+        markdown += f"- [{safe_text}]({article['url']})\n"
     return markdown + "\n"
 
 def main():
@@ -96,24 +114,27 @@ def main():
 
     # Hacker News
     hn_html = fetch_html_content('https://news.ycombinator.com')
+    hn_links = []
     if hn_html:
         hn_links = extract_hacker_news_links(hn_html)
-        markdown_report += generate_markdown_report(hn_links, "Hacker News")
+    markdown_report += generate_markdown_report(hn_links, "Hacker News")
 
     # GitHub Trending
     gh_html = fetch_html_content('https://github.com/trending')
+    gh_links = []
     if gh_html:
         gh_links = extract_github_trending(gh_html)
-        markdown_report += generate_markdown_report(gh_links, "GitHub Trending")
+    markdown_report += generate_markdown_report(gh_links, "GitHub Trending")
 
     # NYTimes
     nytimes_html = fetch_html_content('https://m.cn.nytimes.com')
+    nytimes_links = []
     if nytimes_html:
         nytimes_links = extract_nytimes_links(nytimes_html)
-        markdown_report += generate_markdown_report(nytimes_links, "NYTimes (Chinese)")
+    markdown_report += generate_markdown_report(nytimes_links, "NYTimes (Chinese)")
 
-    # Send report to Telegram
-    if markdown_report.strip() != f"# Daily News Summary - {today}":
+    # Only send if at least one section has news
+    if any([hn_links, gh_links, nytimes_links]):
         if send_telegram_message(markdown_report):
             print("Daily news report sent to Telegram successfully.")
             sys.exit(0)