|
24 | 24 | BM25ContentFilter,
|
25 | 25 | PruningContentFilter,
|
26 | 26 | BrowserProfiler,
|
| 27 | + DefaultMarkdownGenerator, |
27 | 28 | LLMConfig
|
28 | 29 | )
|
29 | 30 | from litellm import completion
|
@@ -614,17 +615,28 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
|
614 | 615 | crawler_cfg = crawler_cfg.clone(**crawler)
|
615 | 616 |
|
616 | 617 | # Handle content filter config
|
617 |
| - if filter_config: |
618 |
| - filter_conf = load_config_file(filter_config) |
| 618 | + if filter_config or output in ["markdown-fit", "md-fit"]: |
| 619 | + if filter_config: |
| 620 | + filter_conf = load_config_file(filter_config) |
| 621 | + elif not filter_config and output in ["markdown-fit", "md-fit"]: |
| 622 | + filter_conf = { |
| 623 | + "type": "pruning", |
| 624 | + "query": "", |
| 625 | + "threshold": 0.48 |
| 626 | + } |
619 | 627 | if filter_conf["type"] == "bm25":
|
620 |
| - crawler_cfg.content_filter = BM25ContentFilter( |
621 |
| - user_query=filter_conf.get("query"), |
622 |
| - bm25_threshold=filter_conf.get("threshold", 1.0) |
| 628 | + crawler_cfg.markdown_generator = DefaultMarkdownGenerator( |
| 629 | + content_filter = BM25ContentFilter( |
| 630 | + user_query=filter_conf.get("query"), |
| 631 | + bm25_threshold=filter_conf.get("threshold", 1.0) |
| 632 | + ) |
623 | 633 | )
|
624 | 634 | elif filter_conf["type"] == "pruning":
|
625 |
| - crawler_cfg.content_filter = PruningContentFilter( |
626 |
| - user_query=filter_conf.get("query"), |
627 |
| - threshold=filter_conf.get("threshold", 0.48) |
| 635 | + crawler_cfg.markdown_generator = DefaultMarkdownGenerator( |
| 636 | + content_filter = PruningContentFilter( |
| 637 | + user_query=filter_conf.get("query"), |
| 638 | + threshold=filter_conf.get("threshold", 0.48) |
| 639 | + ) |
628 | 640 | )
|
629 | 641 |
|
630 | 642 | # Handle extraction strategy
|
|
0 commit comments