Skip to content

Commit 2b19e84

Browse files
lablnet, itsvinayak and cclauss
authored
Create emails_from_url.py (TheAlgorithms#1756)
* Create emails_from_url.py * Update emails_from_url.py * Update emails_from_url.py * 0 emails found: * Update emails_from_url.py * Use Python set() to remove duplicates * Update emails_from_url.py * Add type hints and doctests Co-authored-by: vinayak <[email protected]> Co-authored-by: Christian Clauss <[email protected]>
1 parent c1a4cc9 commit 2b19e84

File tree

1 file changed

+105
-0
lines changed

1 file changed

+105
-0
lines changed

web_programming/emails_from_url.py

+105
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
"""Get the site emails from URL."""

__author__ = "Muhammad Umer Farooq"
__license__ = "MIT"
__version__ = "1.0.0"
__maintainer__ = "Muhammad Umer Farooq"
__email__ = "[email protected]"
__status__ = "Alpha"
9+
import re
10+
from html.parser import HTMLParser
11+
from urllib import parse
12+
13+
import requests
14+
15+
16+
class Parser(HTMLParser):
    """Collect the absolute URLs of all anchor tags in a page, without duplicates."""

    def __init__(self, domain: str):
        super().__init__()
        # Absolute URLs discovered so far, in document order.
        self.data = []
        # Base against which relative hrefs are resolved.
        self.domain = domain

    def handle_starttag(self, tag: str, attrs: list) -> None:
        """
        Take the href of every anchor tag, resolve it against the base,
        and store it if not already seen.
        """
        # Only the 'anchor' tag carries the links we want.
        if tag != "a":
            return
        for name, value in attrs:
            # Skip missing, empty and fragment-only hrefs
            # (a bare `<a href>` yields value=None, which urljoin rejects).
            if name == "href" and value not in (None, "", "#"):
                url = parse.urljoin(self.domain, value)
                # Deduplicate on the RESOLVED url: distinct raw hrefs
                # (e.g. "/a" and an absolute link) can point to the same target.
                if url not in self.data:
                    self.data.append(url)
36+
37+
38+
# Extract the registered domain (example.com) from a URL.
def get_domain_name(url: str) -> str:
    """
    Return the last two labels of the URL's network location.

    >>> get_domain_name("https://a.b.c.d/e/f?g=h,i=j#k")
    'c.d'
    >>> get_domain_name("Not a URL!")
    ''
    """
    labels = get_sub_domain_name(url).split(".")
    return ".".join(labels[-2:])


# Extract the full host name (sub.example.com) from a URL.
def get_sub_domain_name(url: str) -> str:
    """
    Return the network-location part of the URL.

    >>> get_sub_domain_name("https://a.b.c.d/e/f?g=h,i=j#k")
    'a.b.c.d'
    >>> get_sub_domain_name("Not a URL!")
    ''
    """
    parsed = parse.urlparse(url)
    return parsed.netloc
62+
63+
64+
def emails_from_url(url: str = "https://github.com") -> list:
    """
    Crawl the page at *url* and every link found on it, returning a
    sorted, duplicate-free list of e-mail addresses on the page's domain.
    """
    # Base domain (example.com) used to filter e-mail addresses.
    domain = get_domain_name(url)

    # Resolve relative links against the full URL, not the bare domain,
    # so hrefs like "/about" become valid absolute URLs.
    parser = Parser(url)

    # Escape the domain so its dots match literally, and compile once
    # since the pattern is reused for every crawled link.
    email_re = re.compile(r"[a-zA-Z0-9._%+-]+@" + re.escape(domain))

    valid_emails = set()
    try:
        # Fetch the page and feed the raw HTML to the link parser.
        response = requests.get(url, timeout=10)
        parser.feed(response.text)

        # Visit each collected link and harvest matching addresses.
        for link in parser.data:
            # Best effort: a link that cannot be fetched is just skipped.
            try:
                response = requests.get(link, timeout=10)
                valid_emails.update(email_re.findall(response.text))
            except (ValueError, requests.exceptions.RequestException):
                pass
    except (ValueError, requests.exceptions.RequestException):
        # The initial page itself is unreachable or invalid: abort.
        raise SystemExit(-1)

    # Finally return a sorted list of email addresses with no duplicates.
    return sorted(valid_emails)
100+
101+
102+
if __name__ == "__main__":
    # Demo: crawl GitHub and report every address found, one per line.
    emails = sorted(emails_from_url("https://github.com"))
    print(f"{len(emails)} emails found:")
    print("\n".join(emails))

0 commit comments

Comments
 (0)