Skip to content

Commit c213d1c

Browse files
committed
feat(nytimes): update news_bot.py
1 parent 6280652 commit c213d1c

File tree

1 file changed

+42
-21
lines changed

1 file changed

+42
-21
lines changed

scripts/nytimes/news_bot.py

Lines changed: 42 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -2,9 +2,9 @@
22
from bs4 import BeautifulSoup
33
import os
44
from dotenv import load_dotenv
5-
import ssl
65
import datetime
76
import sys
7+
import re
88

99
load_dotenv()
1010

@@ -34,9 +34,8 @@ def send_telegram_message(message):
3434
def fetch_html_content(url):
3535
"""Fetches the HTML content of a given URL."""
3636
try:
37-
context = ssl._create_unverified_context()
3837
print(f"Fetching HTML content from: {url}")
39-
response = requests.get(url, verify=False)
38+
response = requests.get(url, timeout=15)
4039
response.raise_for_status()
4140
print(f"Successfully fetched HTML content from: {url}")
4241
return response.text
def extract_hacker_news_links(html, limit=5):
    """Extract story links from the Hacker News front page.

    Args:
        html: HTML text of the Hacker News page.
        limit: Maximum number of stories to return (default 5).

    Returns:
        A list of ``{'url': ..., 'text': ...}`` dicts in page order, without
        duplicate URLs or empty titles, at most ``limit`` long.
    """
    soup = BeautifulSoup(html, 'html.parser')
    links = []
    seen = set()
    for item in soup.select('.titleline > a'):
        if len(links) >= limit:
            break
        # An anchor without an href has nothing to link to; skip it rather
        # than raising KeyError on the attribute lookup.
        url = item.get('href')
        if not url:
            continue
        title = item.text.strip()
        if url.startswith('item?id='):  # Handle internal HN links
            url = f"https://news.ycombinator.com/{url}"
        # Avoid duplicates and empty titles
        if not title or url in seen:
            continue
        seen.add(url)
        links.append({'url': url, 'text': title})
    print(f"Extracted {len(links)} links from Hacker News.")
    return links
5964

6065
def extract_github_trending(html, limit=5):
    """Extract trending repositories from the GitHub trending page.

    Args:
        html: HTML text of the trending page.
        limit: Maximum number of repositories to return (default 5).

    Returns:
        A list of ``{'url': ..., 'text': ...}`` dicts in page order, at most
        ``limit`` long. ``text`` is the anchor text with every run of
        whitespace collapsed to a single space.
    """
    soup = BeautifulSoup(html, 'html.parser')
    links = []
    for repo in soup.select('article.Box-row h2 a'):
        if len(links) >= limit:
            break
        # The URL is built by prefixing the host, so it is always non-empty;
        # the meaningful check is on the href itself. Anchors without one are
        # skipped instead of raising KeyError.
        href = repo.get('href')
        if not href:
            continue
        # Clean up repo name: remove extra whitespace and newlines
        title = re.sub(r'\s+', ' ', repo.text).strip()
        if title:
            links.append({'url': f"https://github.com{href}", 'text': title})
    print(f"Extracted {len(links)} trending repositories from GitHub.")
    return links
7079

7180
def extract_nytimes_links(html, limit=5):
    """Extract article links from the main page of cn.nytimes.com (mobile).

    Args:
        html: HTML text of the page.
        limit: Maximum number of links to return (default 5).

    Returns:
        A list of ``{'url': ..., 'text': ...}`` dicts in page order. Only
        anchors whose href starts with ``https://m.cn.nytimes.com/``, has a
        path beyond the site root, carries non-empty text and has not been
        seen before are kept; at most ``limit`` are returned.
    """
    site_root = 'https://m.cn.nytimes.com/'
    soup = BeautifulSoup(html, 'html.parser')
    links = []
    seen = set()
    for a in soup.find_all('a', href=True):
        if len(links) >= limit:
            break
        url = a['href']
        # A link to the bare site root passes the prefix test but is
        # navigation, not an article, so it is excluded explicitly.
        if url == site_root or not url.startswith(site_root):
            continue
        text = a.get_text(strip=True)
        if not text or url in seen:
            continue
        seen.add(url)
        links.append({'url': url, 'text': text})
    print(f"Extracted {len(links)} links from NYTimes.")
    return links
8497

8598
def generate_markdown_report(articles, source_name):
    """Generate a Markdown section listing the given articles.

    Args:
        articles: Iterable of dicts with ``'text'`` and ``'url'`` string
            entries. May be empty.
        source_name: Heading text for the section.

    Returns:
        A string starting with ``### <source_name>`` followed by either one
        ``- [text](url)`` bullet per article or ``_No items found._`` when
        there are no articles, ending with a blank line.
    """
    header = f"### {source_name}\n\n"
    if not articles:
        return header + "_No items found._\n\n"

    # Square brackets in the label would close the link text early, so they
    # are backslash-escaped along with the parentheses.
    escape_label = str.maketrans(
        {'[': '\\[', ']': '\\]', '(': '\\(', ')': '\\)'}
    )
    # A literal ")" in the URL would end the link destination early;
    # percent-encoding keeps the URL equivalent while staying inside "(...)".
    encode_url = str.maketrans({'(': '%28', ')': '%29'})

    bullets = [
        "- [{}]({})\n".format(
            article['text'].translate(escape_label),
            article['url'].translate(encode_url),
        )
        for article in articles
    ]
    return header + "".join(bullets) + "\n"
91109

92110
def main():
@@ -96,24 +114,27 @@ def main():
96114

97115
# Hacker News
98116
hn_html = fetch_html_content('https://news.ycombinator.com')
117+
hn_links = []
99118
if hn_html:
100119
hn_links = extract_hacker_news_links(hn_html)
101-
markdown_report += generate_markdown_report(hn_links, "Hacker News")
120+
markdown_report += generate_markdown_report(hn_links, "Hacker News")
102121

103122
# GitHub Trending
104123
gh_html = fetch_html_content('https://github.com/trending')
124+
gh_links = []
105125
if gh_html:
106126
gh_links = extract_github_trending(gh_html)
107-
markdown_report += generate_markdown_report(gh_links, "GitHub Trending")
127+
markdown_report += generate_markdown_report(gh_links, "GitHub Trending")
108128

109129
# NYTimes
110130
nytimes_html = fetch_html_content('https://m.cn.nytimes.com')
131+
nytimes_links = []
111132
if nytimes_html:
112133
nytimes_links = extract_nytimes_links(nytimes_html)
113-
markdown_report += generate_markdown_report(nytimes_links, "NYTimes (Chinese)")
134+
markdown_report += generate_markdown_report(nytimes_links, "NYTimes (Chinese)")
114135

115-
# Send report to Telegram
116-
if markdown_report.strip() != f"# Daily News Summary - {today}":
136+
# Only send if at least one section has news
137+
if any([hn_links, gh_links, nytimes_links]):
117138
if send_telegram_message(markdown_report):
118139
print("Daily news report sent to Telegram successfully.")
119140
sys.exit(0)

0 commit comments

Comments
 (0)