// FireCrawl-TEST.ts

import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'
import { FireCrawlLoader } from './FireCrawl'
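
// Smoke tests for the four FireCrawlLoader modes used below: 'scrape',
// 'crawl', 'extract', and 'getExtractStatus'. A real FireCrawl API key is
// expected in the environment; the string fallback is only a placeholder
// and will not authenticate against the live API.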
async function testFireCrawl() {
    const apiKey = process.env.FIRECRAWL_API_KEY || 'FIRECRAWL_API_KEY'
    const apiUrl = 'https://api.firecrawl.dev'

    // Test URLs (the trailing '/*' lets extract mode consider any page under the domain)
    const testUrl = 'https://firecrawl.dev/'
    const testUrlForExtract = 'https://firecrawl.dev/*'

    // Test 1: Basic Scraping
    console.log('\n=== Testing Basic Scraping ===')
    try {
        const scrapeLoader = new FireCrawlLoader({
            url: testUrl,
            apiKey,
            apiUrl,
            mode: 'scrape',
            params: {
                onlyMainContent: true,
                includeTags: ['article', 'main', 'section'],
                excludeTags: ['header', 'footer', 'nav']
            }
        })

        const scrapeDocs = await scrapeLoader.load()
        console.log('Scrape Results:', {
            numDocs: scrapeDocs.length,
            firstDocMetadata: scrapeDocs[0]?.metadata,
            firstDocContent: scrapeDocs[0]?.pageContent.substring(0, 100) + '...'
        })
    } catch (error) {
        console.error('Scraping Error:', error)
    }

    // Test 2: Crawling with Text Splitter
    console.log('\n=== Testing Crawling with Text Splitter ===')
    try {
        const textSplitter = new RecursiveCharacterTextSplitter({
            chunkSize: 1000,
            chunkOverlap: 200
        })

        const crawlLoader = new FireCrawlLoader({
            url: testUrl,
            apiKey,
            apiUrl,
            mode: 'crawl',
            params: {
                limit: 5, // Limit to 5 pages for testing
                includePaths: ['/docs', '/blog'],
                excludePaths: ['/api', '/admin']
            }
        })

        console.log('Starting crawl with params:', {
            url: testUrl,
            apiKey: apiKey.substring(0, 8) + '...',
            apiUrl,
            mode: 'crawl'
        })

        const crawlDocs = await crawlLoader.load()
        if (!crawlDocs || crawlDocs.length === 0) {
            console.warn('No documents were returned from the crawl')
            return
        }

        // Split the crawled pages into overlapping chunks
        const splitDocs = await textSplitter.splitDocuments(crawlDocs)

        console.log('Crawl Results:', {
            numDocs: crawlDocs.length,
            numChunks: splitDocs.length,
            firstDocMetadata: crawlDocs[0]?.metadata,
            firstDocContent: crawlDocs[0]?.pageContent.substring(0, 100) + '...'
        })
    } catch (error: any) {
        console.error('Crawling Error Details:', {
            message: error.message,
            stack: error.stack,
            response: error.response?.data,
            status: error.response?.status
        })
    }

    // Test 3: Data Extraction
    console.log('\n=== Testing Data Extraction ===')
    try {
        const extractLoader = new FireCrawlLoader({
            url: testUrlForExtract,
            apiKey,
            apiUrl,
            mode: 'extract',
            params: {
                schema: {
                    type: 'object',
                    properties: {
                        company: {
                            type: 'object',
                            properties: {
                                name: { type: 'string' },
                                mission: { type: 'string' },
                                is_open_source: { type: 'boolean' }
                            },
                            required: ['name']
                        }
                    },
                    required: ['company']
                },
                prompt: 'Extract the company name, mission, and determine if the company is open source.'
            }
        })

        const extractDocs = await extractLoader.load()
        console.log('Extract Results:', {
            numDocs: extractDocs.length,
            firstDocMetadata: extractDocs[0]?.metadata,
            firstDocContent: extractDocs[0]?.pageContent
        })
    } catch (error) {
        console.error('Extraction Error:', error)
    }
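
    // Extract jobs may run asynchronously; when the response carries a job ID,
    // its progress can be checked with the 'getExtractStatus' mode below.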

    // Test 4: Get Extract Status
    console.log('\n=== Testing Get Extract Status ===')
    try {
        const statusLoader = new FireCrawlLoader({
            url: testUrl,
            apiKey,
            apiUrl,
            mode: 'getExtractStatus',
            params: { jobId: 'EXTRACT_JOB_ID' } // Replace with an actual job ID
        })

        const statusResult = await statusLoader.load()
        console.log('Status Results:', statusResult)
    } catch (error) {
        console.error('Status Check Error:', error)
    }
}
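
// A minimal sketch of how the 'getExtractStatus' check above could be polled
// until a job reaches a terminal state. The 'status' field and the values
// 'completed' / 'failed' are assumptions about the shape of the object that
// FireCrawlLoader returns in this mode, not a confirmed contract; adjust to
// whatever the loader actually returns. Not invoked by the test run below.
async function pollExtractStatus(jobId: string, apiKey: string, apiUrl: string, intervalMs = 5000, maxAttempts = 12) {
    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
        const statusLoader = new FireCrawlLoader({
            url: 'https://firecrawl.dev/',
            apiKey,
            apiUrl,
            mode: 'getExtractStatus',
            params: { jobId }
        })
        const result: any = await statusLoader.load()
        console.log(`Poll attempt ${attempt}:`, result)
        // Stop once the job reports a terminal state (assumed field and values)
        if (result?.status === 'completed' || result?.status === 'failed') {
            return result
        }
        await new Promise((resolve) => setTimeout(resolve, intervalMs))
    }
    throw new Error(`Extract job ${jobId} did not finish after ${maxAttempts} attempts`)
}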
// Run the tests
testFireCrawl().catch((error) => {
    console.error('Fatal error:', error)
    process.exit(1)
})