
Commit 38044d4

Merge pull request #255 from maheshpec/feature/configure-cache-directory
feat(config): Adding a configurable way of setting the cache directory for constrained environments
2 parents: 8c22396 + 00026b5

10 files changed (+14, -14)
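
Every change swaps a hard-coded `Path.home()` base for an optional `CRAWL4_AI_BASE_DIRECTORY` environment variable, letting constrained environments (read-only home directories, containers, serverless runtimes) redirect crawl4ai's on-disk state. A minimal sketch of how a caller might opt in; the path is an example, and the variable should be set before the library is imported, since several of the touched modules read it at import time:

```python
import os

# Redirect crawl4ai's cache/db/models folders to a writable location.
# /var/tmp/crawl4ai is an example path, not a project default.
os.environ["CRAWL4_AI_BASE_DIRECTORY"] = "/var/tmp/crawl4ai"

import crawl4ai  # noqa: E402  (imported after the environment is configured)
```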

crawl4ai/async_crawler_strategy.py (+2, -2)

```diff
@@ -525,7 +525,7 @@ async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
 
         if self.use_cached_html:
             cache_file_path = os.path.join(
-                Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
+                os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
             )
             if os.path.exists(cache_file_path):
                 html = ""
@@ -725,7 +725,7 @@ async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
 
         if self.use_cached_html:
             cache_file_path = os.path.join(
-                Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
+                os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
             )
             with open(cache_file_path, "w", encoding="utf-8") as f:
                 f.write(html)
```
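
Both hunks build the same cache location, so with the variable set, cached HTML lands under the configured base instead of the home directory. A quick sketch of the resolved path (URL and base are illustrative):

```python
import hashlib
import os
from pathlib import Path

url = "https://example.com"
base = os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())
cache_file_path = os.path.join(
    base, ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
)
print(cache_file_path)  # <base>/.crawl4ai/cache/<md5 hex digest of the URL>
```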

crawl4ai/async_database.py (+1, -1)

```diff
@@ -10,7 +10,7 @@
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-DB_PATH = os.path.join(Path.home(), ".crawl4ai")
+DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
 os.makedirs(DB_PATH, exist_ok=True)
 DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
 
```

crawl4ai/async_webcrawler.py (+2, -2)

```diff
@@ -23,14 +23,14 @@ def __init__(
         self,
         crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
         always_by_pass_cache: bool = False,
-        base_directory: str = str(Path.home()),
+        base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
         **kwargs,
     ):
         self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
             **kwargs
         )
         self.always_by_pass_cache = always_by_pass_cache
-        # self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        # self.crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
         self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
         os.makedirs(self.crawl4ai_folder, exist_ok=True)
         os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
```
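Note that the new default is evaluated once, when `__init__` is defined at import time, so exporting `CRAWL4_AI_BASE_DIRECTORY` only helps if it happens before `crawl4ai` is imported. Passing `base_directory` explicitly sidesteps that ordering concern; a sketch, assuming the usual `arun` entry point (the path is an example):

```python
import asyncio

from crawl4ai import AsyncWebCrawler

async def main():
    # An explicit base_directory overrides both the environment
    # variable and the home-directory fallback.
    async with AsyncWebCrawler(base_directory="/var/tmp/crawl4ai") as crawler:
        result = await crawler.arun(url="https://example.com")
        print(result.markdown[:200])

asyncio.run(main())
```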

crawl4ai/crawler_strategy.py (+3, -3)

```diff
@@ -132,7 +132,7 @@ def __init__(self, use_cached_html=False, js_code=None, **kwargs):
 
         # chromedriver_autoinstaller.install()
         # import chromedriver_autoinstaller
-        # crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        # crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
         # driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options)
         # chromedriver_path = chromedriver_autoinstaller.install()
         # chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver()
@@ -205,7 +205,7 @@ def crawl(self, url: str, **kwargs) -> str:
         url_hash = hashlib.md5(url.encode()).hexdigest()
 
         if self.use_cached_html:
-            cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
+            cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash)
             if os.path.exists(cache_file_path):
                 with open(cache_file_path, "r") as f:
                     return sanitize_input_encode(f.read())
@@ -275,7 +275,7 @@ def crawl(self, url: str, **kwargs) -> str:
         self.driver = self.execute_hook('before_return_html', self.driver, html)
 
         # Store in cache
-        cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
+        cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash)
         with open(cache_file_path, "w", encoding="utf-8") as f:
             f.write(html)
 
```

crawl4ai/database.py (+1, -1)

```diff
@@ -3,7 +3,7 @@
 import sqlite3
 from typing import Optional, Tuple
 
-DB_PATH = os.path.join(Path.home(), ".crawl4ai")
+DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
 os.makedirs(DB_PATH, exist_ok=True)
 DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
 
```

crawl4ai/model_loader.py (+1, -1)

```diff
@@ -56,7 +56,7 @@ def set_model_device(model):
 
 @lru_cache()
 def get_home_folder():
-    home_folder = os.path.join(Path.home(), ".crawl4ai")
+    home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
     os.makedirs(home_folder, exist_ok=True)
     os.makedirs(f"{home_folder}/cache", exist_ok=True)
     os.makedirs(f"{home_folder}/models", exist_ok=True)
```
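
One interaction worth knowing: `get_home_folder` is wrapped in `@lru_cache()`, so the environment variable is read only on the first call and the result is memoized for the rest of the process. A small sketch of that behavior (paths are illustrative):

```python
import os
from functools import lru_cache
from pathlib import Path

@lru_cache()
def get_home_folder():
    # The environment lookup runs once; later calls return the cached value.
    return os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")

os.environ["CRAWL4_AI_BASE_DIRECTORY"] = "/data/a"
print(get_home_folder())  # /data/a/.crawl4ai

os.environ["CRAWL4_AI_BASE_DIRECTORY"] = "/data/b"
print(get_home_folder())  # still /data/a/.crawl4ai -- the cached result
```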

crawl4ai/utils.py (+1, -1)

```diff
@@ -60,7 +60,7 @@ class MEMORYSTATUSEX(ctypes.Structure):
         raise OSError("Unsupported operating system")
 
 def get_home_folder():
-    home_folder = os.path.join(Path.home(), ".crawl4ai")
+    home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), ".crawl4ai")
     os.makedirs(home_folder, exist_ok=True)
     os.makedirs(f"{home_folder}/cache", exist_ok=True)
     os.makedirs(f"{home_folder}/models", exist_ok=True)
```
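
As committed, this line nests one `os.getenv` lookup inside another; the inner call merely supplies the default for the outer call against the same variable, so the pair behaves identically to the single lookup used in the other files:

```python
# Equivalent single-lookup form, matching the rest of the commit.
home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
```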

crawl4ai/web_crawler.py (+1, -1)

```diff
@@ -20,7 +20,7 @@ class WebCrawler:
     def __init__(self, crawler_strategy: CrawlerStrategy = None, always_by_pass_cache: bool = False, verbose: bool = False):
         self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
         self.always_by_pass_cache = always_by_pass_cache
-        self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        self.crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
         os.makedirs(self.crawl4ai_folder, exist_ok=True)
         os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
         init_db()
```

docs/md_v2/api/async-webcrawler.md (+1, -1)

```diff
@@ -13,7 +13,7 @@ AsyncWebCrawler(
 
     # Cache Settings
     always_by_pass_cache: bool = False,  # Always bypass cache
-    base_directory: str = str(Path.home()),  # Base directory for cache
+    base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),  # Base directory for cache
 
     # Network Settings
     proxy: str = None,  # Simple proxy URL
```

setup.py (+1, -1)

```diff
@@ -8,7 +8,7 @@
 
 # Create the .crawl4ai folder in the user's home directory if it doesn't exist
 # If the folder already exists, remove the cache folder
-crawl4ai_folder = Path.home() / ".crawl4ai"
+crawl4ai_folder = os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()) / ".crawl4ai"
 cache_folder = crawl4ai_folder / "cache"
 
 if cache_folder.exists():
```
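
A caveat in this hunk: `os.getenv` returns a plain `str` when the variable is set, and `str / str` is not a valid operation, so the `/ ".crawl4ai"` join only works on the `Path.home()` fallback branch. Wrapping the result in `Path` would keep both branches working; a minimal sketch:

```python
import os
from pathlib import Path

# Coerce the env-var value (a str when set) back to a Path so the
# `/` join works whether or not CRAWL4_AI_BASE_DIRECTORY is defined.
crawl4ai_folder = Path(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())) / ".crawl4ai"
cache_folder = crawl4ai_folder / "cache"
```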
