2
2
from bs4 import BeautifulSoup
3
3
import os
4
4
from dotenv import load_dotenv
5
- import ssl
6
5
import datetime
7
6
import sys
7
+ import re
8
8
9
9
load_dotenv ()
10
10
@@ -34,9 +34,8 @@ def send_telegram_message(message):
34
34
def fetch_html_content (url ):
35
35
"""Fetches the HTML content of a given URL."""
36
36
try :
37
- context = ssl ._create_unverified_context ()
38
37
print (f"Fetching HTML content from: { url } " )
39
- response = requests .get (url , verify = False )
38
+ response = requests .get (url , timeout = 15 )
40
39
response .raise_for_status ()
41
40
print (f"Successfully fetched HTML content from: { url } " )
42
41
return response .text
def extract_hacker_news_links(html):
    """Extracts links from Hacker News."""
    soup = BeautifulSoup(html, 'html.parser')
    results = []
    visited = set()
    # Front-page story anchors live under ".titleline" on news.ycombinator.com.
    for anchor in soup.select('.titleline > a'):
        href = anchor['href']
        label = anchor.text.strip()
        # Internal HN stories use relative "item?id=..." hrefs; absolutize them.
        if href.startswith('item?id='):
            href = f"https://news.ycombinator.com/{href}"
        # Skip duplicate URLs and title-less anchors.
        if href in visited or not label:
            continue
        visited.add(href)
        results.append({'url': href, 'text': label})
        # Stop as soon as we have the top five stories.
        if len(results) >= 5:
            break
    print(f"Extracted {len(results)} links from Hacker News.")
    return results
59
64
60
65
def extract_github_trending(html):
    """Extract up to five trending repositories from a GitHub Trending page.

    Parameters:
        html: Raw HTML of the trending page.

    Returns:
        A list of at most five dicts of the form {'url': ..., 'text': ...},
        in page order with duplicate URLs removed.
    """
    soup = BeautifulSoup(html, 'html.parser')
    links = []
    seen = set()  # dedupe by URL, consistent with the other extractors
    # NOTE(review): selector assumes GitHub's current markup (repo link inside
    # an <h2> within each article.Box-row) — verify if the page layout changes.
    for repo in soup.select('article.Box-row h2 a'):
        url = f"https://github.com{repo['href']}"
        # Collapse the multi-line "owner / name" anchor text to single spaces.
        title = re.sub(r'\s+', ' ', repo.text).strip()
        # url is always non-empty (built from an f-string), so only the title
        # needs a truthiness check; also skip repeated hrefs.
        if title and url not in seen:
            links.append({'url': url, 'text': title})
            seen.add(url)
            if len(links) >= 5:
                break
    print(f"Extracted {len(links)} trending repositories from GitHub.")
    return links
70
79
71
80
def extract_nytimes_links(html):
    """Extracts links from the main page of cn.nytimes.com."""
    soup = BeautifulSoup(html, 'html.parser')
    collected = []
    visited = set()
    prefix = 'https://m.cn.nytimes.com/'
    # Walk every anchor on the mobile Chinese-edition front page and keep the
    # first five distinct article links that carry visible text.
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        label = anchor.get_text(strip=True)
        if not label or not href.startswith(prefix) or href in visited:
            continue
        visited.add(href)
        collected.append({'url': href, 'text': label})
        if len(collected) >= 5:
            break
    print(f"Extracted {len(collected)} links from NYTimes.")
    return collected
84
97
85
98
def generate_markdown_report(articles, source_name):
    """Generate a Markdown section listing the given articles.

    Parameters:
        articles: List of dicts with 'text' (title) and 'url' keys.
        source_name: Heading for the section (e.g. "Hacker News").

    Returns:
        A Markdown string: an H3 heading followed by a bulleted link list,
        or a "_No items found._" placeholder when *articles* is empty.
    """
    markdown = f"### {source_name}\n\n"
    if not articles:
        markdown += "_No items found._\n\n"
        return markdown
    for article in articles:
        # Escape the Markdown link delimiters in the title: unescaped "[" / "]"
        # (not just parentheses) would break the [text](url) syntax.
        safe_text = article['text']
        for ch in '[]()':
            safe_text = safe_text.replace(ch, '\\' + ch)
        markdown += f"- [{safe_text}]({article['url']})\n"
    return markdown + "\n"
91
109
92
110
def main ():
@@ -96,24 +114,27 @@ def main():
96
114
97
115
# Hacker News
98
116
hn_html = fetch_html_content ('https://news.ycombinator.com' )
117
+ hn_links = []
99
118
if hn_html :
100
119
hn_links = extract_hacker_news_links (hn_html )
101
- markdown_report += generate_markdown_report (hn_links , "Hacker News" )
120
+ markdown_report += generate_markdown_report (hn_links , "Hacker News" )
102
121
103
122
# GitHub Trending
104
123
gh_html = fetch_html_content ('https://github.com/trending' )
124
+ gh_links = []
105
125
if gh_html :
106
126
gh_links = extract_github_trending (gh_html )
107
- markdown_report += generate_markdown_report (gh_links , "GitHub Trending" )
127
+ markdown_report += generate_markdown_report (gh_links , "GitHub Trending" )
108
128
109
129
# NYTimes
110
130
nytimes_html = fetch_html_content ('https://m.cn.nytimes.com' )
131
+ nytimes_links = []
111
132
if nytimes_html :
112
133
nytimes_links = extract_nytimes_links (nytimes_html )
113
- markdown_report += generate_markdown_report (nytimes_links , "NYTimes (Chinese)" )
134
+ markdown_report += generate_markdown_report (nytimes_links , "NYTimes (Chinese)" )
114
135
115
- # Send report to Telegram
116
- if markdown_report . strip () != f"# Daily News Summary - { today } " :
136
+ # Only send if at least one section has news
137
+ if any ([ hn_links , gh_links , nytimes_links ]) :
117
138
if send_telegram_message (markdown_report ):
118
139
print ("Daily news report sent to Telegram successfully." )
119
140
sys .exit (0 )
0 commit comments