@@ -363,106 +363,115 @@ async def get_crawl_queue(self, crawl_id, offset, count, regex):

         total = 0
         results = []
-        redis = None

         try:
-            redis = await self.get_redis(crawl_id)
+            async with self.get_redis(crawl_id) as redis:
+                total = await self._crawl_queue_len(redis, f"{crawl_id}:q")
+                results = await self._crawl_queue_range(
+                    redis, f"{crawl_id}:q", offset, count
+                )
+                results = [json.loads(result)["url"] for result in results]

-            total = await self._crawl_queue_len(redis, f"{crawl_id}:q")
-            results = await self._crawl_queue_range(
-                redis, f"{crawl_id}:q", offset, count
-            )
-            results = [json.loads(result)["url"] for result in results]
         except exceptions.ConnectionError:
             # can't connect to redis, likely not initialized yet
             pass

         matched = []
         if regex:
-            regex = re.compile(regex)
+            try:
+                regex = re.compile(regex)
+            except re.error as exc:
+                raise HTTPException(status_code=400, detail="invalid_regex") from exc
+
             matched = [result for result in results if regex.search(result)]

         return {"total": total, "results": results, "matched": matched}

     async def match_crawl_queue(self, crawl_id, regex):
         """get list of urls that match regex"""
         total = 0
-        redis = None
-
-        try:
-            redis = await self.get_redis(crawl_id)
-            total = await self._crawl_queue_len(redis, f"{crawl_id}:q")
-        except exceptions.ConnectionError:
-            # can't connect to redis, likely not initialized yet
-            pass
-
-        regex = re.compile(regex)
         matched = []
         step = 50

-        for count in range(0, total, step):
-            results = await self._crawl_queue_range(redis, f"{crawl_id}:q", count, step)
-            for result in results:
-                url = json.loads(result)["url"]
-                if regex.search(url):
-                    matched.append(url)
+        async with self.get_redis(crawl_id) as redis:
+            try:
+                total = await self._crawl_queue_len(redis, f"{crawl_id}:q")
+            except exceptions.ConnectionError:
+                # can't connect to redis, likely not initialized yet
+                pass
+
+            try:
+                regex = re.compile(regex)
+            except re.error as exc:
+                raise HTTPException(status_code=400, detail="invalid_regex") from exc
+
+            for count in range(0, total, step):
+                results = await self._crawl_queue_range(
+                    redis, f"{crawl_id}:q", count, step
+                )
+                for result in results:
+                    url = json.loads(result)["url"]
+                    if regex.search(url):
+                        matched.append(url)

         return {"total": total, "matched": matched}

     async def filter_crawl_queue(self, crawl_id, regex):
         """filter out urls that match regex"""
         # pylint: disable=too-many-locals
         total = 0
-        redis = None
-
         q_key = f"{crawl_id}:q"
         s_key = f"{crawl_id}:s"
-
-        try:
-            redis = await self.get_redis(crawl_id)
-            total = await self._crawl_queue_len(redis, f"{crawl_id}:q")
-        except exceptions.ConnectionError:
-            # can't connect to redis, likely not initialized yet
-            pass
-
-        dircount = -1
-        regex = re.compile(regex)
         step = 50
-
-        count = 0
         num_removed = 0

-        # pylint: disable=fixme
-        # todo: do this in a more efficient way?
-        # currently quite inefficient as redis does not have a way
-        # to atomically check and remove value from list
-        # so removing each jsob block by value
-        while count < total:
-            if dircount == -1 and count > total / 2:
-                dircount = 1
-            results = await self._crawl_queue_range(redis, q_key, count, step)
-            count += step
-
-            qrems = []
-            srems = []
-
-            for result in results:
-                url = json.loads(result)["url"]
-                if regex.search(url):
-                    srems.append(url)
-                    # await redis.srem(s_key, url)
-                    # res = await self._crawl_queue_rem(redis, q_key, result, dircount)
-                    qrems.append(result)
-
-            if not srems:
-                continue
-
-            await redis.srem(s_key, *srems)
-            res = await self._crawl_queue_rem(redis, q_key, qrems, dircount)
-            if res:
-                count -= res
-                num_removed += res
-                print(f"Removed {res} from queue", flush=True)
+        async with self.get_redis(crawl_id) as redis:
+            try:
+                total = await self._crawl_queue_len(redis, f"{crawl_id}:q")
+            except exceptions.ConnectionError:
+                # can't connect to redis, likely not initialized yet
+                pass
+
+            dircount = -1
+
+            try:
+                regex = re.compile(regex)
+            except re.error as exc:
+                raise HTTPException(status_code=400, detail="invalid_regex") from exc
+
+            count = 0
+
+            # pylint: disable=fixme
+            # todo: do this in a more efficient way?
+            # currently quite inefficient as redis does not have a way
+            # to atomically check and remove value from list
+            # so removing each jsob block by value
+            while count < total:
+                if dircount == -1 and count > total / 2:
+                    dircount = 1
+                results = await self._crawl_queue_range(redis, q_key, count, step)
+                count += step
+
+                qrems = []
+                srems = []
+
+                for result in results:
+                    url = json.loads(result)["url"]
+                    if regex.search(url):
+                        srems.append(url)
+                        # await redis.srem(s_key, url)
+                        # res = await self._crawl_queue_rem(redis, q_key, result, dircount)
+                        qrems.append(result)
+
+                if not srems:
+                    continue
+
+                await redis.srem(s_key, *srems)
+                res = await self._crawl_queue_rem(redis, q_key, qrems, dircount)
+                if res:
+                    count -= res
+                    num_removed += res
+                    print(f"Removed {res} from queue", flush=True)

         return num_removed

@@ -475,13 +484,13 @@ async def get_errors_from_redis(
         skip = page * page_size
         upper_bound = skip + page_size - 1

-        try:
-            redis = await self.get_redis(crawl_id)
-            errors = await redis.lrange(f"{crawl_id}:e", skip, upper_bound)
-            total = await redis.llen(f"{crawl_id}:e")
-        except exceptions.ConnectionError:
-            # pylint: disable=raise-missing-from
-            raise HTTPException(status_code=503, detail="redis_connection_error")
+        async with self.get_redis(crawl_id) as redis:
+            try:
+                errors = await redis.lrange(f"{crawl_id}:e", skip, upper_bound)
+                total = await redis.llen(f"{crawl_id}:e")
+            except exceptions.ConnectionError:
+                # pylint: disable=raise-missing-from
+                raise HTTPException(status_code=503, detail="redis_connection_error")

         parsed_errors = parse_jsonl_error_messages(errors)
         return parsed_errors, total
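A note on the pattern introduced above: `async with self.get_redis(crawl_id) as redis:` only works if `get_redis` now returns an async context manager rather than a bare client. The project's actual implementation of `get_redis` is not shown in this diff; the sketch below is a hypothetical illustration of how such a helper could be written with `contextlib.asynccontextmanager` and `redis.asyncio`, so the connection is released even when the body raises. The class name and the URL template are assumptions made up for the example.

```python
from contextlib import asynccontextmanager

from redis import asyncio as aioredis


class CrawlOpsSketch:
    """Hypothetical stand-in for the class patched above."""

    def __init__(self, redis_url_template="redis://{id}-redis:6379/0"):
        # illustrative URL pattern, not the project's real one
        self.redis_url_template = redis_url_template

    @asynccontextmanager
    async def get_redis(self, crawl_id):
        """Yield a redis client for this crawl and always close it on exit."""
        redis = aioredis.from_url(
            self.redis_url_template.format(id=crawl_id), decode_responses=True
        )
        try:
            yield redis
        finally:
            # runs on normal exit and when the async-with body raises
            await redis.close()
```

With a helper shaped like this, every caller in the diff gets the same guarantee: the regex or JSON handling can raise (for example the new `HTTPException(status_code=400, detail="invalid_regex")`) without leaking the Redis connection.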