diff --git a/packages/components/nodes/documentloaders/FireCrawl/FireCrawl-TEST.ts b/packages/components/nodes/documentloaders/FireCrawl/FireCrawl-TEST.ts
new file mode 100644
index 00000000000..651728223ab
--- /dev/null
+++ b/packages/components/nodes/documentloaders/FireCrawl/FireCrawl-TEST.ts
@@ -0,0 +1,147 @@
+import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'
+import { FireCrawlLoader } from './FireCrawl'
+
+async function testFireCrawl() {
+    const apiKey = process.env.FIRECRAWL_API_KEY || 'FIRECRAWL_API_KEY'
+    const apiUrl = 'https://api.firecrawl.dev'
+
+    // Test URLs
+    const testUrl = 'https://firecrawl.dev/'
+    const testUrlForExtract = 'https://firecrawl.dev/*'
+
+    // Test 1: Basic Scraping
+    console.log('\n=== Testing Basic Scraping ===')
+    try {
+        const scrapeLoader = new FireCrawlLoader({
+            url: testUrl,
+            apiKey,
+            apiUrl,
+            mode: 'scrape',
+            params: {
+                onlyMainContent: true,
+                includeTags: ['article', 'main', 'section'],
+                excludeTags: ['header', 'footer', 'nav']
+            }
+        })
+        const scrapeDocs = await scrapeLoader.load()
+        console.log('Scrape Results:', {
+            numDocs: scrapeDocs.length,
+            firstDocMetadata: scrapeDocs[0]?.metadata,
+            firstDocContent: scrapeDocs[0]?.pageContent.substring(0, 100) + '...'
+        })
+    } catch (error) {
+        console.error('Scraping Error:', error)
+    }
+
+    // Test 2: Crawling with Text Splitter
+    console.log('\n=== Testing Crawling with Text Splitter ===')
+    try {
+        const textSplitter = new RecursiveCharacterTextSplitter({
+            chunkSize: 1000,
+            chunkOverlap: 200
+        })
+        const crawlLoader = new FireCrawlLoader({
+            url: testUrl,
+            apiKey,
+            apiUrl,
+            mode: 'crawl',
+            params: {
+                limit: 5, // Limit to 5 pages for testing
+                includePaths: ['/docs', '/blog'],
+                excludePaths: ['/api', '/admin']
+            }
+        })
+
+        console.log('Starting crawl with params:', {
+            url: testUrl,
+            apiKey: apiKey.substring(0, 8) + '...',
+            apiUrl,
+            mode: 'crawl'
+        })
+
+        const crawlDocs = await crawlLoader.load()
+
+        if (!crawlDocs || crawlDocs.length === 0) {
+            console.warn('No documents were returned from the crawl')
+            return
+        }
+
+        // Run the splitter over the crawled docs so the test actually exercises it
+        const splitDocs = await textSplitter.splitDocuments(crawlDocs)
+
+        console.log('Crawl Results:', {
+            numDocs: crawlDocs.length,
+            numChunks: splitDocs.length,
+            firstDocMetadata: crawlDocs[0]?.metadata,
+            firstDocContent: crawlDocs[0]?.pageContent.substring(0, 100) + '...'
+        })
+    } catch (error: any) {
+        console.error('Crawling Error Details:', {
+            message: error.message,
+            stack: error.stack,
+            response: error.response?.data,
+            status: error.response?.status
+        })
+    }
+
+    // Test 3: Data Extraction
+    console.log('\n=== Testing Data Extraction ===')
+    try {
+        const extractLoader = new FireCrawlLoader({
+            url: testUrlForExtract,
+            apiKey,
+            apiUrl,
+            mode: 'extract',
+            params: {
+                schema: {
+                    type: 'object',
+                    properties: {
+                        company: {
+                            type: 'object',
+                            properties: {
+                                name: {
+                                    type: 'string'
+                                },
+                                mission: {
+                                    type: 'string'
+                                },
+                                is_open_source: {
+                                    type: 'boolean'
+                                }
+                            },
+                            required: ['name']
+                        }
+                    },
+                    required: ['company']
+                },
+                prompt: 'Extract the company name, mission, and determine if the company is open source.'
+            }
+        })
+        const extractDocs = await extractLoader.load()
+        console.log('Extract Results:', {
+            numDocs: extractDocs.length,
+            firstDocMetadata: extractDocs[0]?.metadata,
+            firstDocContent: extractDocs[0]?.pageContent
+        })
+    } catch (error) {
+        console.error('Extraction Error:', error)
+    }
+
+    // Test 4: Get Extract Status
+    console.log('\n=== Testing Get Extract Status ===')
+    try {
+        const statusLoader = new FireCrawlLoader({
+            url: testUrl,
+            apiKey,
+            apiUrl,
+            mode: 'getExtractStatus',
+            params: { jobId: 'EXTRACT_JOB_ID' } // Replace with an actual job ID
+        })
+        const statusResult = await statusLoader.load()
+        console.log('Status Results:', statusResult)
+    } catch (error) {
+        console.error('Status Check Error:', error)
+    }
+}
+
+// Run the tests
+testFireCrawl().catch((error) => {
+    console.error('Fatal error:', error)
+    process.exit(1)
+})
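Illustrative only: given the schema in Test 3 above, a successful extract is expected to resolve to a single JSON object of roughly the following shape; the values here are invented for illustration, not real Firecrawl output.

// Hypothetical result for the Test 3 schema (illustrative values only;
// the real response depends on the pages Firecrawl crawls):
const exampleExtractData = {
    company: {
        name: 'Firecrawl',
        mission: 'Turn websites into LLM-ready data',
        is_open_source: true
    }
}

The loader's extract mode then serializes this object into the pageContent of a single Document, as shown in the implementation diff below.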
diff --git a/packages/components/nodes/documentloaders/FireCrawl/FireCrawl.ts b/packages/components/nodes/documentloaders/FireCrawl/FireCrawl.ts
index a2707d13b07..a72e6017e9a 100644
--- a/packages/components/nodes/documentloaders/FireCrawl/FireCrawl.ts
+++ b/packages/components/nodes/documentloaders/FireCrawl/FireCrawl.ts
@@ -17,25 +17,24 @@ interface FirecrawlDocumentMetadata {
     title?: string
     description?: string
     language?: string
-    // ... (other metadata fields)
+    sourceURL?: string
+    statusCode?: number
+    error?: string
     [key: string]: any
 }
 
 interface FirecrawlDocument {
-    id?: string
-    url?: string
-    content: string
     markdown?: string
     html?: string
-    llm_extraction?: Record<string, any>
-    createdAt?: Date
-    updatedAt?: Date
-    type?: string
+    rawHtml?: string
+    screenshot?: string
+    links?: string[]
+    actions?: {
+        screenshots?: string[]
+    }
     metadata: FirecrawlDocumentMetadata
-    childrenLinks?: string[]
-    provider?: string
+    llm_extraction?: Record<string, any>
     warning?: string
-    index?: number
 }
 
 interface ScrapeResponse {
@@ -46,11 +45,28 @@ interface CrawlResponse {
     success: boolean
-    jobId?: string
-    data?: FirecrawlDocument[]
+    id: string
+    url: string
     error?: string
 }
 
+interface CrawlStatusResponse {
+    status: string
+    total: number
+    completed: number
+    creditsUsed: number
+    expiresAt: string
+    next?: string
+    data?: FirecrawlDocument[]
+}
+
+interface ExtractResponse {
+    success: boolean
+    id: string
+    url: string
+    data?: Record<string, any>
+}
+
 interface Params {
     [key: string]: any
     extractorOptions?: {
     }
 }
@@ -60,6 +76,35 @@
+type Format = 'markdown' | 'html' | 'rawHtml' | 'links' | 'screenshot' | 'screenshot@fullPage' | 'json'
+
+interface ExtractRequest {
+    urls: string[]
+    prompt?: string
+    schema?: Record<string, any>
+    enableWebSearch?: boolean
+    ignoreSitemap?: boolean
+    includeSubdomains?: boolean
+    showSources?: boolean
+    scrapeOptions?: {
+        formats: Format[]
+        onlyMainContent?: boolean
+        includeTags?: string[]
+        excludeTags?: string[]
+        mobile?: boolean
+        skipTlsVerification?: boolean
+        timeout?: number
+        [key: string]: any
+    }
+}
+
+interface ExtractStatusResponse {
+    success: boolean
+    data: any
+    status: 'completed' | 'pending' | 'processing' | 'failed' | 'cancelled'
+    expiresAt: string
+}
+
 // FirecrawlApp class (not exported)
 class FirecrawlApp {
     private apiKey: string
@@ -91,7 +136,7 @@ class FirecrawlApp {
             }
         }
         try {
-            const response: AxiosResponse = await this.postRequest(this.apiUrl + '/v0/scrape', jsonData, headers)
+            const response: AxiosResponse = await this.postRequest(this.apiUrl + '/v1/scrape', jsonData, headers)
             if (response.status === 200) {
                 const responseData = response.data
                 if (responseData.success) {
@@ -114,33 +159,83 @@ class FirecrawlApp {
         waitUntilDone: boolean = true,
         pollInterval: number = 2,
         idempotencyKey?: string
-    ): Promise<any> {
+    ): Promise<CrawlResponse | CrawlStatusResponse> {
         const headers = this.prepareHeaders(idempotencyKey)
         let jsonData: Params = { url, ...params }
         try {
-            const response: AxiosResponse = await this.postRequest(this.apiUrl + '/v0/crawl', jsonData, headers)
+            const response: AxiosResponse = await this.postRequest(this.apiUrl + '/v1/crawl', jsonData, headers)
             if (response.status === 200) {
-                const jobId: string = response.data.jobId
+                const crawlResponse = response.data as CrawlResponse
+                if (!crawlResponse.success) {
+                    throw new Error(`Crawl request failed: ${crawlResponse.error || 'Unknown error'}`)
+                }
+
                 if (waitUntilDone) {
-                    return this.monitorJobStatus(jobId, headers, pollInterval)
+                    return this.monitorJobStatus(crawlResponse.id, headers, pollInterval)
                 } else {
-                    return { success: true, jobId }
+                    return crawlResponse
                 }
             } else {
                 this.handleError(response, 'start crawl job')
             }
+        } catch (error: any) {
+            if (error.response?.data?.error) {
+                throw new Error(`Crawl failed: ${error.response.data.error}`)
+            }
+
+            throw new Error(`Crawl failed: ${error.message}`)
+        }
+
+        return { success: false, id: '', url: '' }
+    }
+
+    async extract(
+        request: ExtractRequest,
+        waitUntilDone: boolean = true,
+        pollInterval: number = 2
+    ): Promise<ExtractResponse | ExtractStatusResponse> {
+        const headers = this.prepareHeaders()
+        try {
+            const response: AxiosResponse = await this.postRequest(this.apiUrl + '/v1/extract', request, headers)
+            if (response.status === 200) {
+                const extractResponse = response.data as ExtractResponse
+                if (waitUntilDone) {
+                    return this.monitorExtractStatus(extractResponse.id, headers, pollInterval)
+                } else {
+                    return extractResponse
+                }
+            } else {
+                this.handleError(response, 'start extract job')
+            }
         } catch (error: any) {
             throw new Error(error.message)
         }
-        return { success: false, error: 'Internal server error.' }
+        return { success: false, id: '', url: '' }
+    }
+
+    async getExtractStatus(jobId: string): Promise<ExtractStatusResponse> {
+        const headers = this.prepareHeaders()
+        try {
+            const response: AxiosResponse = await this.getRequest(this.apiUrl + `/v1/extract/${jobId}`, headers)
+            if (response.status === 200) {
+                return response.data as ExtractStatusResponse
+            } else {
+                this.handleError(response, 'get extract status')
+            }
+        } catch (error: any) {
+            throw new Error(error.message)
+        }
+        return { success: false, data: null, status: 'failed', expiresAt: '' }
+    }
 
     private prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders {
         return {
             'Content-Type': 'application/json',
             Authorization: `Bearer ${this.apiKey}`,
+            'X-Origin': 'flowise',
+            'X-Origin-Type': 'integration',
             ...(idempotencyKey ? { 'x-idempotency-key': idempotencyKey } : {})
-        } as AxiosRequestHeaders & { 'x-idempotency-key'?: string }
+        } as AxiosRequestHeaders & { 'X-Origin': string; 'X-Origin-Type': string; 'x-idempotency-key'?: string }
     }
 
     private postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise<AxiosResponse> {
@@ -151,33 +246,58 @@ class FirecrawlApp {
         return axios.get(url, { headers })
     }
 
-    private async monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, checkInterval: number): Promise<any> {
+    private async monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, checkInterval: number): Promise<CrawlStatusResponse> {
         let isJobCompleted = false
         while (!isJobCompleted) {
-            const statusResponse: AxiosResponse = await this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers)
+            const statusResponse: AxiosResponse = await this.getRequest(this.apiUrl + `/v1/crawl/${jobId}`, headers)
             if (statusResponse.status === 200) {
-                const statusData = statusResponse.data
+                const statusData = statusResponse.data as CrawlStatusResponse
                 switch (statusData.status) {
                     case 'completed':
                         isJobCompleted = true
-                        if ('data' in statusData) {
-                            return statusData.data
-                        } else {
-                            throw new Error('Crawl job completed but no data was returned')
-                        }
-                    case 'active':
-                    case 'paused':
-                    case 'pending':
-                    case 'queued':
+                        return statusData
+                    case 'scraping':
                         await new Promise((resolve) => setTimeout(resolve, Math.max(checkInterval, 2) * 1000))
                         break
+                    case 'failed':
+                        throw new Error('Crawl job failed')
                     default:
-                        throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`)
+                        throw new Error(`Unknown crawl status: ${statusData.status}`)
                 }
             } else {
                 this.handleError(statusResponse, 'check crawl status')
             }
         }
+        throw new Error('Failed to monitor job status')
     }
+
+    private async monitorExtractStatus(jobId: string, headers: AxiosRequestHeaders, checkInterval: number): Promise<ExtractStatusResponse> {
+        let isJobCompleted = false
+        while (!isJobCompleted) {
+            const statusResponse: AxiosResponse = await this.getRequest(this.apiUrl + `/v1/extract/${jobId}`, headers)
+            if (statusResponse.status === 200) {
+                const statusData = statusResponse.data as ExtractStatusResponse
+                switch (statusData.status) {
+                    case 'completed':
+                        isJobCompleted = true
+                        return statusData
+                    case 'pending':
+                    case 'processing':
+                        await new Promise((resolve) => setTimeout(resolve, Math.max(checkInterval, 2) * 1000))
+                        break
+                    case 'failed':
+                        throw new Error('Extract job failed')
+                    default:
+                        throw new Error(`Unknown extract status: ${statusData.status}`)
+                }
+            } else {
+                this.handleError(statusResponse, 'check extract status')
+            }
+        }
+        throw new Error('Failed to monitor extract status')
+    }
 
     private handleError(response: AxiosResponse, action: string): void {
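The two public methods above also support a non-blocking flow. A minimal sketch, assuming a FirecrawlApp instance named app constructed as in the class above; the URL and prompt are illustrative:

// Sketch: start an extract job without waiting, then poll it later.
// `app`, the URL, and the prompt are assumptions for illustration.
const job = (await app.extract({ urls: ['https://firecrawl.dev/*'], prompt: 'What is the company name?' }, false)) as ExtractResponse
const status = await app.getExtractStatus(job.id)
if (status.status === 'completed') {
    console.log('extracted data:', status.data)
} else {
    console.log('job still running, check again later:', status.status)
}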
@@ -195,15 +315,15 @@ interface FirecrawlLoaderParameters {
     url: string
     apiKey?: string
     apiUrl?: string
-    mode?: 'crawl' | 'scrape'
+    mode?: 'crawl' | 'scrape' | 'extract' | 'getExtractStatus'
     params?: Record<string, unknown>
 }
 
-class FireCrawlLoader extends BaseDocumentLoader {
+export class FireCrawlLoader extends BaseDocumentLoader {
     private apiKey: string
     private apiUrl: string
     private url: string
-    private mode: 'crawl' | 'scrape'
+    private mode: 'crawl' | 'scrape' | 'extract' | 'getExtractStatus'
     private params?: Record<string, unknown>
 
     constructor(loaderParams: FirecrawlLoaderParameters) {
@@ -232,9 +352,35 @@ class FireCrawlLoader extends BaseDocumentLoader {
             firecrawlDocs = [response.data as FirecrawlDocument]
         } else if (this.mode === 'crawl') {
             const response = await app.crawlUrl(this.url, this.params, true)
-            firecrawlDocs = response as FirecrawlDocument[]
+            if ('data' in response) {
+                firecrawlDocs = response.data || []
+            } else {
+                throw new Error('Crawl completed but no data was returned')
+            }
+        } else if (this.mode === 'extract') {
+            this.params = { ...(this.params ?? {}), urls: [this.url] }
+            const response = await app.extract(this.params as unknown as ExtractRequest)
+            if (!response.success) {
+                throw new Error(`Firecrawl: Failed to extract URL.`)
+            }
+            firecrawlDocs = [response.data as FirecrawlDocument]
+        } else if (this.mode === 'getExtractStatus') {
+            const jobId = this.params?.jobId as string
+            if (!jobId) {
+                throw new Error('Firecrawl: Job ID is required to get extract status.')
+            }
+            const response = await app.getExtractStatus(jobId)
+            if (!response.success) {
+                throw new Error(`Firecrawl: Failed to get extract status.`)
+            }
+            // Wrap the raw job data in a Document so load() keeps its Document[] contract
+            return [new Document({ pageContent: JSON.stringify(response.data), metadata: {} })]
         } else {
-            throw new Error(`Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape'.`)
+            throw new Error(`Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape', 'extract', 'getExtractStatus'.`)
+        }
+
+        if (this.mode === 'extract') {
+            const newDoc = new Document({
+                pageContent: JSON.stringify(firecrawlDocs),
+                metadata: {}
+            })
+            return [newDoc]
         }
 
         return firecrawlDocs.map(
@@ -287,7 +433,7 @@ class FireCrawl_DocumentLoaders implements INode {
                 label: 'URLs',
                 name: 'url',
                 type: 'string',
-                description: 'URL to be crawled/scraped',
+                description: 'URL to be crawled/scraped/extracted',
                 placeholder: 'https://docs.flowiseai.com'
             },
             {
@@ -304,47 +450,81 @@ class FireCrawl_DocumentLoaders implements INode {
                         label: 'Scrape',
                         name: 'scrape',
                         description: 'Scrape a URL and get its content'
+                    },
+                    {
+                        label: 'Extract',
+                        name: 'extract',
+                        description: 'Extract data from a URL'
+                    },
+                    {
+                        label: 'Get extract status (DATA)',
+                        name: 'getExtractStatus',
+                        description: 'Get the status of an extract job'
                     }
                 ],
                 default: 'crawl'
            },
            {
-                // maxCrawlPages
-                label: 'Max Crawl Pages',
-                name: 'maxCrawlPages',
+                // includeTags
+                label: '[Scrape] Include Tags',
+                name: 'includeTags',
                 type: 'string',
-                description: 'Maximum number of pages to crawl',
+                description: 'Comma-separated list of tags to include in the output',
                 optional: true,
                 additionalParams: true
             },
             {
-                // generateImgAltText
-                label: 'Generate Image Alt Text',
-                name: 'generateImgAltText',
-                type: 'boolean',
-                description: 'Generate alt text for images',
+                // excludeTags
+                label: '[Scrape] Exclude Tags',
+                name: 'excludeTags',
+                type: 'string',
+                description: 'Comma-separated list of tags to exclude from the output',
                 optional: true,
                 additionalParams: true
             },
             {
-                // returnOnlyUrls
-                label: 'Return Only URLs',
-                name: 'returnOnlyUrls',
+                // onlyMainContent
+                label: '[Scrape] Only Main Content',
+                name: 'onlyMainContent',
                 type: 'boolean',
-                description: 'Return only URLs of the crawled pages',
+                description: 'Extract only the main content of the page',
                 optional: true,
                 additionalParams: true
             },
             {
-                // onlyMainContent
-                label: 'Only Main Content',
-                name: 'onlyMainContent',
-                type: 'boolean',
-                description: 'Extract only the main content of the page',
+                // limit
+                label: '[Crawl] Limit',
+                name: 'limit',
+                type: 'string',
+                description: 'Maximum number of pages to crawl',
                 optional: true,
-                additionalParams: true
+                additionalParams: true,
+                default: '10000'
             },
+            {
+                label: '[Extract] Schema',
+                name: 'extractSchema',
+                type: 'json',
+                description: 'JSON schema for data extraction',
+                optional: true,
+                additionalParams: true
+            },
+            {
+                label: '[Extract] Prompt',
+                name: 'extractPrompt',
+                type: 'string',
+                description: 'Prompt for data extraction',
+                optional: true,
+                additionalParams: true
+            },
+            {
+                label: '[Extract] Job ID',
+                name: 'extractJobId',
+                type: 'string',
+                description: 'ID of the extract job',
                 optional: true,
                 additionalParams: true
             }
-            // ... (other input parameters)
         ]
         this.outputs = [
             {
@@ -367,9 +547,7 @@ class FireCrawl_DocumentLoaders implements INode {
         const metadata = nodeData.inputs?.metadata
         const url = nodeData.inputs?.url as string
         const crawlerType = nodeData.inputs?.crawlerType as string
-        const maxCrawlPages = nodeData.inputs?.maxCrawlPages as string
-        const generateImgAltText = nodeData.inputs?.generateImgAltText as boolean
-        const returnOnlyUrls = nodeData.inputs?.returnOnlyUrls as boolean
+        const limit = nodeData.inputs?.limit as string
         const onlyMainContent = nodeData.inputs?.onlyMainContent as boolean
         const credentialData = await getCredentialData(nodeData.credential ?? '', options)
         const firecrawlApiToken = getCredentialParam('firecrawlApiToken', credentialData, nodeData)
@@ -383,22 +561,25 @@ class FireCrawl_DocumentLoaders implements INode {
             ? (nodeData.inputs.urlPatternsIncludes.split(',') as string[])
             : undefined
 
+        const extractSchema = nodeData.inputs?.extractSchema
+        const extractPrompt = nodeData.inputs?.extractPrompt as string
+        const extractJobId = nodeData.inputs?.extractJobId as string
+        const includeTags = nodeData.inputs?.includeTags as string
+        const excludeTags = nodeData.inputs?.excludeTags as string
+
         const input: FirecrawlLoaderParameters = {
             url,
-            mode: crawlerType as 'crawl' | 'scrape',
+            mode: crawlerType as 'crawl' | 'scrape' | 'extract' | 'getExtractStatus',
             apiKey: firecrawlApiToken,
             apiUrl: firecrawlApiUrl,
             params: {
-                crawlerOptions: {
-                    includes: urlPatternsIncludes,
-                    excludes: urlPatternsExcludes,
-                    generateImgAltText,
-                    returnOnlyUrls,
-                    limit: maxCrawlPages ? parseFloat(maxCrawlPages) : undefined
+                // v1 crawl options are top-level; per-page options go under scrapeOptions
+                includePaths: urlPatternsIncludes,
+                excludePaths: urlPatternsExcludes,
+                limit: limit ? parseInt(limit, 10) : 10000,
+                scrapeOptions: {
+                    onlyMainContent,
+                    includeTags: includeTags ? includeTags.split(',') : undefined,
+                    excludeTags: excludeTags ? excludeTags.split(',') : undefined
                 },
-                pageOptions: {
-                    onlyMainContent
-                }
+                schema: extractSchema ?? undefined,
+                prompt: extractPrompt ?? undefined,
+                jobId: extractJobId ?? undefined
             }
         }
         const loader = new FireCrawlLoader(input)
@@ -440,3 +621,6 @@ class FireCrawl_DocumentLoaders implements INode {
 }
 
 module.exports = { nodeClass: FireCrawl_DocumentLoaders }
+
+// FOR TESTING PURPOSES
+// export { FireCrawl_DocumentLoaders }
diff --git a/packages/components/package.json b/packages/components/package.json
index 36f16153868..960f1244b85 100644
--- a/packages/components/package.json
+++ b/packages/components/package.json
@@ -56,7 +56,7 @@
         "@langchain/qdrant": "^0.0.5",
         "@langchain/weaviate": "^0.0.1",
         "@langchain/xai": "^0.0.1",
-        "@mendable/firecrawl-js": "^0.0.28",
+        "@mendable/firecrawl-js": "^1.18.2",
         "@mistralai/mistralai": "0.1.3",
         "@modelcontextprotocol/sdk": "^1.6.1",
         "@modelcontextprotocol/server-brave-search": "^0.6.2",
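For reference, under the v1 request shape this diff targets, the node ends up sending bodies along the following lines; the shapes reflect the interfaces defined above and all values are illustrative:

// Illustrative v1 request bodies (assumed shapes, example values):
const crawlBody = {
    url: 'https://docs.flowiseai.com',
    limit: 10000,
    includePaths: ['/docs'],
    excludePaths: ['/api'],
    scrapeOptions: {
        onlyMainContent: true,
        includeTags: ['article', 'main'],
        excludeTags: ['nav', 'footer']
    }
}

const extractBody = {
    urls: ['https://docs.flowiseai.com/*'],
    prompt: 'Extract the product name and a one-line description.',
    schema: {
        type: 'object',
        properties: {
            name: { type: 'string' },
            description: { type: 'string' }
        },
        required: ['name']
    }
}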