Skip to content

Commit dafbe6d

Browse files
gremid and adbar authored
CLI: print URLs early for feeds and sitemaps with --list (#744)
* cli: also stream URL list gathered from feeds
* make streaming default and add threading.RLock

---------

Co-authored-by: Adrien Barbaresi <[email protected]>
1 parent 2274ceb commit dafbe6d

File tree

1 file changed

+7
-8
lines changed

1 file changed

+7
-8
lines changed

trafilatura/cli_utils.py

Lines changed: 7 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
from datetime import datetime
2121
from functools import partial
2222
from os import makedirs, path, stat, walk
23+
from threading import RLock
2324
from typing import Any, Generator, Optional, List, Set, Tuple
2425

2526
from courlan import UrlStore, extract_domain, get_base_url # validate_url
@@ -301,6 +302,7 @@ def cli_discovery(args: Any) -> None:
301302
external=options.config.getboolean("DEFAULT", "EXTERNAL_URLS"),
302303
sleep_time=options.config.getfloat("DEFAULT", "SLEEP_TIME"),
303304
)
305+
lock = RLock()
304306

305307
# link discovery and storage
306308
with ThreadPoolExecutor(max_workers=args.parallel) as executor:
@@ -311,14 +313,11 @@ def cli_discovery(args: Any) -> None:
311313
if future.result() is not None:
312314
url_store.add_urls(future.result())
313315
# empty buffer in order to spare memory
314-
if (
315-
args.sitemap
316-
and args.list
317-
and len(url_store.get_known_domains()) >= args.parallel
318-
):
319-
url_store.print_unvisited_urls()
320-
url_store.reset()
321-
reset_caches()
316+
if args.list and len(url_store.get_known_domains()) >= args.parallel:
317+
with lock:
318+
url_store.print_unvisited_urls()
319+
url_store.reset()
320+
reset_caches()
322321

323322
# process the (rest of the) links found
324323
error_caught = url_processing_pipeline(args, url_store)

0 commit comments

Comments
 (0)