Skip to content

Commit 6eed4ad

Browse files
committed
Merge branch 'vr0.5.0.post5'
2 parents 79328e4 + bdd9db5 commit 6eed4ad

File tree

2 files changed: +21 -9 lines changed

2 files changed: +21 -9 lines changed

crawl4ai/__version__.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,2 @@
11
# crawl4ai/_version.py
2-
__version__ = "0.5.0.post4"
2+
__version__ = "0.5.0.post6"

crawl4ai/cli.py

+20 -8
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@
2424
BM25ContentFilter,
2525
PruningContentFilter,
2626
BrowserProfiler,
27+
DefaultMarkdownGenerator,
2728
LLMConfig
2829
)
2930
from litellm import completion
@@ -614,17 +615,28 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
614615
crawler_cfg = crawler_cfg.clone(**crawler)
615616

616617
# Handle content filter config
617-
if filter_config:
618-
filter_conf = load_config_file(filter_config)
618+
if filter_config or output in ["markdown-fit", "md-fit"]:
619+
if filter_config:
620+
filter_conf = load_config_file(filter_config)
621+
elif not filter_config and output in ["markdown-fit", "md-fit"]:
622+
filter_conf = {
623+
"type": "pruning",
624+
"query": "",
625+
"threshold": 0.48
626+
}
619627
if filter_conf["type"] == "bm25":
620-
crawler_cfg.content_filter = BM25ContentFilter(
621-
user_query=filter_conf.get("query"),
622-
bm25_threshold=filter_conf.get("threshold", 1.0)
628+
crawler_cfg.markdown_generator = DefaultMarkdownGenerator(
629+
content_filter = BM25ContentFilter(
630+
user_query=filter_conf.get("query"),
631+
bm25_threshold=filter_conf.get("threshold", 1.0)
632+
)
623633
)
624634
elif filter_conf["type"] == "pruning":
625-
crawler_cfg.content_filter = PruningContentFilter(
626-
user_query=filter_conf.get("query"),
627-
threshold=filter_conf.get("threshold", 0.48)
635+
crawler_cfg.markdown_generator = DefaultMarkdownGenerator(
636+
content_filter = PruningContentFilter(
637+
user_query=filter_conf.get("query"),
638+
threshold=filter_conf.get("threshold", 0.48)
639+
)
628640
)
629641

630642
# Handle extraction strategy

0 commit comments

Comments
 (0)