- import asyncio
import re
from typing import Callable, Self
from urllib.parse import urljoin
import raggy
from raggy.documents import Document, document_to_excerpts
from raggy.loaders.base import Loader, MultiLoader
+ from raggy.utilities.asyncutils import run_concurrent_tasks
from raggy.utilities.collections import batched

user_agent = UserAgent()

- URL_CONCURRENCY = asyncio.Semaphore(30)
-

def ensure_http(url):
    if not url.startswith(("http://", "https://")):
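The new `run_concurrent_tasks` import takes over the throttling role of the removed module-level `URL_CONCURRENCY` semaphore. The utility's implementation is not part of this diff; as a rough, hedged sketch, the behavior the new call sites rely on (awaiting a list of zero-argument callables with at most `max_concurrent` running at once) could look roughly like this:

```python
import asyncio
from typing import Awaitable, Callable, TypeVar

T = TypeVar("T")


async def run_concurrent_tasks(
    tasks: list[Callable[[], Awaitable[T]]], max_concurrent: int = 10
) -> list[T]:
    """Run task factories concurrently, bounded by a semaphore (illustrative only)."""
    semaphore = asyncio.Semaphore(max_concurrent)

    async def run_one(task: Callable[[], Awaitable[T]]) -> T:
        async with semaphore:  # never more than max_concurrent awaits in flight
            return await task()

    return await asyncio.gather(*(run_one(t) for t in tasks))
```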
@@ -30,7 +28,6 @@ async def sitemap_search(sitemap_url) -> list[str]:
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "xml")
-
    return [loc.text for loc in soup.find_all("loc")]

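For reference, a minimal illustration (not part of the diff) of what the `<loc>` extraction in `sitemap_search` returns; the sample sitemap and URLs are made up, and the `"xml"` parser assumes `lxml` is installed:

```python
from bs4 import BeautifulSoup

# A tiny, hypothetical sitemap payload.
xml = b"""<?xml version="1.0"?>
<urlset>
  <url><loc>https://controlflow.ai/</loc></url>
  <url><loc>https://controlflow.ai/docs/welcome</loc></url>
</urlset>"""

soup = BeautifulSoup(xml, "xml")
# Same expression as in sitemap_search: every <loc> text is a candidate URL.
assert [loc.text for loc in soup.find_all("loc")] == [
    "https://controlflow.ai/",
    "https://controlflow.ai/docs/welcome",
]
```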
@@ -51,28 +48,33 @@ class URLLoader(WebLoader):
    """

    source_type: str = "url"
-
    urls: list[str] = Field(default_factory=list)

    async def load(self) -> list[Document]:
        headers = await self.get_headers()
        async with AsyncClient(
            headers=headers, timeout=30, follow_redirects=True
        ) as client:
-             documents = await asyncio.gather(
-                 *[self.load_url(u, client) for u in self.urls], return_exceptions=True
+
+             async def load_url_task(url):
+                 try:
+                     return await self.load_url(url, client)
+                 except Exception as e:
+                     self.logger.error(e)
+                     return None
+
+             documents = await run_concurrent_tasks(
+                 [lambda u=url: load_url_task(u) for url in self.urls], max_concurrent=30
            )
+
            final_documents = []
            for d in documents:
-                 if isinstance(d, Exception):
-                     self.logger.error(d)
-                 elif d is not None:
-                     final_documents.extend(await document_to_excerpts(d))  # type: ignore
+                 if d is not None:
+                     final_documents.extend(await document_to_excerpts(d))
            return final_documents

    async def load_url(self, url, client) -> Document | None:
-         async with URL_CONCURRENCY:
-             response = await client.get(url, follow_redirects=True)
+         response = await client.get(url, follow_redirects=True)

        if not response.status_code == 200:
            self.logger.warning_style(
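One detail worth noting in the new `load` body: the task factories are written `lambda u=url: load_url_task(u)` rather than `lambda: load_url_task(url)`, because a bare closure over the loop variable is evaluated late and every task would end up hitting the last URL. A standalone illustration (not part of the diff) of how the default argument freezes each value:

```python
urls = ["https://a.example", "https://b.example"]

late = [lambda: u for u in urls]       # all closures share the loop variable
bound = [lambda u=u: u for u in urls]  # default argument captures each value now

assert [f() for f in late] == ["https://b.example", "https://b.example"]
assert [f() for f in bound] == ["https://a.example", "https://b.example"]
```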
@@ -84,16 +86,17 @@ async def load_url(self, url, client) -> Document | None:
        meta_refresh = soup.find(
            "meta", attrs={"http-equiv": re.compile(r"refresh", re.I)}
        )
-         if meta_refresh:
-             refresh_content = meta_refresh.get("content")
-             redirect_url_match = re.search(r"url=([\S]+)", refresh_content, re.I)
-             if redirect_url_match:
-                 redirect_url = redirect_url_match.group(1)
-                 # join base url with relative url
-                 redirect_url = urljoin(str(response.url), redirect_url)
-                 # Now ensure the URL includes the protocol
-                 redirect_url = ensure_http(redirect_url)
-                 response = await client.get(redirect_url, follow_redirects=True)
+         if meta_refresh and isinstance(meta_refresh, BeautifulSoup.Tag):
+             content = meta_refresh.get("content", "")
+             if isinstance(content, str):
+                 redirect_url_match = re.search(r"url=([\S]+)", content, re.I)
+                 if redirect_url_match:
+                     redirect_url = redirect_url_match.group(1)
+                     # join base url with relative url
+                     redirect_url = urljoin(str(response.url), redirect_url)
+                     # Now ensure the URL includes the protocol
+                     redirect_url = ensure_http(redirect_url)
+                     response = await client.get(redirect_url, follow_redirects=True)

        document = await self.response_to_document(response)
        if document:
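For context (not part of the diff), the `url=([\S]+)` pattern pulls the redirect target out of a typical meta-refresh `content` value, and `urljoin` then absolutizes it against the original page; the page and URLs below are hypothetical:

```python
import re
from urllib.parse import urljoin

content = "0; url=/docs/welcome"  # hypothetical <meta http-equiv="refresh"> content value
match = re.search(r"url=([\S]+)", content, re.I)
assert match is not None
redirect_url = urljoin("https://example.com/old-page", match.group(1))
assert redirect_url == "https://example.com/docs/welcome"
```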
@@ -103,6 +106,7 @@ async def load_url(self, url, client) -> Document | None:
        return document

    async def response_to_document(self, response: Response) -> Document:
+         """Convert an HTTP response to a Document."""
        return Document(
            text=await self.get_document_text(response),
            metadata=dict(
@@ -128,17 +132,15 @@ async def get_document_text(self, response: Response) -> str:
class SitemapLoader(URLLoader):
    """A loader that loads URLs from a sitemap.
-
    Attributes:
        include: A list of strings or regular expressions. Only URLs that match one of these will be included.
        exclude: A list of strings or regular expressions. URLs that match one of these will be excluded.
        url_loader: The loader to use for loading the URLs.
-
    Examples:
        Load all URLs from a sitemap:
        ```python
        from raggy.loaders.web import SitemapLoader
-         loader = SitemapLoader(urls=["https://askmarvin.ai/sitemap.xml"])
+         loader = SitemapLoader(urls=["https://controlflow.ai/sitemap.xml"])
        documents = await loader.load()
        print(documents)
        ```
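The `include`/`exclude` attributes documented above accept plain substrings as well as compiled patterns; a hedged usage sketch (the URLs and filters here are hypothetical) might look like:

```python
import re

from raggy.loaders.web import SitemapLoader

# Only keep sitemap entries under /docs/, and drop anything mentioning "changelog".
loader = SitemapLoader(
    urls=["https://controlflow.ai/sitemap.xml"],
    include=[re.compile(r"/docs/")],
    exclude=["changelog"],
)
# documents = await loader.load()  # awaited inside an async context
```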
@@ -147,11 +149,12 @@ class SitemapLoader(URLLoader):
    include: list[str | re.Pattern] = Field(default_factory=list)
    exclude: list[str | re.Pattern] = Field(default_factory=list)
    url_loader: URLLoader = Field(default_factory=HTMLLoader)
-
    url_processor: Callable[[str], str] = lambda x: x  # noqa: E731

    async def _get_loader(self: Self) -> MultiLoader:
-         urls = await asyncio.gather(*[self.load_sitemap(url) for url in self.urls])
+         urls = await run_concurrent_tasks(
+             [lambda u=url: self.load_sitemap(u) for url in self.urls], max_concurrent=5
+         )
        return MultiLoader(
            loaders=[
                type(self.url_loader)(urls=url_batch, headers=await self.get_headers())  # type: ignore
@@ -169,7 +172,6 @@ async def load_sitemap(self, url: str) -> list[str]:
        def is_included(url: str) -> bool:
            if not self.include:
                return True
-
            return any(
                (isinstance(i, str) and i in url)
                or (isinstance(i, re.Pattern) and re.search(i, url))
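The `is_included` predicate above treats plain strings as substring matches and compiled patterns as regex searches; a standalone illustration (not part of the diff, URLs are made up) of that behavior:

```python
import re


def is_included(url: str, include: list[str | re.Pattern]) -> bool:
    # Mirrors the filter in SitemapLoader.load_sitemap: an empty filter keeps everything.
    if not include:
        return True
    return any(
        (isinstance(i, str) and i in url)
        or (isinstance(i, re.Pattern) and re.search(i, url))
        for i in include
    )


assert is_included("https://controlflow.ai/docs/welcome", ["/docs/"])
assert is_included("https://controlflow.ai/docs/welcome", [re.compile(r"welcome$")])
assert not is_included("https://controlflow.ai/blog", ["/docs/"])
```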