From b20f303b7ba44988ec303cbf757a7b0d19259d5c Mon Sep 17 00:00:00 2001 From: Gary Conroy Date: Tue, 18 Mar 2025 12:40:06 +0000 Subject: [PATCH 1/9] fix(specs): New Crawler API parameter - ignorePaginationAttributes --- specs/crawler/common/schemas/action.yml | 76 +++++- .../crawler/common/schemas/configuration.yml | 239 ++++++++++++++++-- specs/crawler/spec.yml | 6 +- 3 files changed, 282 insertions(+), 39 deletions(-) diff --git a/specs/crawler/common/schemas/action.yml b/specs/crawler/common/schemas/action.yml index 5703f07210..f13a34380d 100644 --- a/specs/crawler/common/schemas/action.yml +++ b/specs/crawler/common/schemas/action.yml @@ -3,10 +3,14 @@ Action: description: | How to process crawled URLs. + Each action defines: + - The targeted subset of URLs it processes. + - What information to extract from the web pages. + - The Algolia indices where the extracted records will be stored. If a single web page matches several actions, @@ -21,9 +25,15 @@ Action: discoveryPatterns: type: array description: | - Indicates _intermediary_ pages that the crawler should visit. + Which _intermediary_ web pages the crawler should visit. + Use `discoveryPatterns` to define pages that should be visited _just_ for their links to other pages, _not_ their content. + - For more information, see the [`discoveryPatterns` documentation](https://www.algolia.com/doc/tools/crawler/apis/discoverypatterns/). + It functions similarly to the `pathsToMatch` action but without record extraction. + + + `discoveryPatterns` uses [micromatch](https://github.com/micromatch/micromatch) to support matching with wildcards, negation, and other features. + The crawler adds all matching URLs to its queue. items: $ref: '#/urlPattern' fileTypesToMatch: @@ -31,6 +41,7 @@ Action: description: | File types for crawling non-HTML documents. + For more information, see [Extract data from non-HTML documents](https://www.algolia.com/doc/tools/crawler/extracting-data/non-html-documents/). maxItems: 100 items: @@ -59,6 +70,7 @@ Action: description: | URLs to which this action should apply. + Uses [micromatch](https://github.com/micromatch/micromatch) for negation, wildcards, and more. minItems: 1 maxItems: 100 @@ -69,9 +81,12 @@ Action: type: object description: | Function for extracting information from a crawled page and transforming it into Algolia records for indexing. - The Crawler has an [editor](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#the-editor) with autocomplete and validation to help you update the `recordExtractor` property. - For details, consult the [`recordExtractor` documentation](https://www.algolia.com/doc/tools/crawler/apis/recordextractor/). + + The Crawler has an [editor](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#the-editor) with autocomplete and validation to help you update the `recordExtractor`. + + + For details, consult the [`recordExtractor` documentation](https://www.algolia.com/doc/tools/crawler/apis/configuration/actions/#parameter-param-recordextractor). properties: __type: $ref: '#/configurationRecordExtractorType' @@ -110,7 +125,8 @@ ActionSchedule: fileTypes: type: string description: | - Supported file type for indexing non-HTML documents. + Supported file types for indexing non-HTML documents. + For more information, see [Extract data from non-HTML documents](https://www.algolia.com/doc/tools/crawler/extracting-data/non-html-documents/). enum: @@ -129,6 +145,7 @@ urlPattern: description: | Pattern for matching URLs. 
+ Uses [micromatch](https://github.com/micromatch/micromatch) for negation, wildcards, and more. example: https://www.algolia.com/** @@ -140,7 +157,43 @@ hostnameAliases: Key-value pairs to replace matching hostnames found in a sitemap, on a page, in canonical links, or redirects. - For more information, see the [`hostnameAliases` documentation](https://www.algolia.com/doc/tools/crawler/apis/hostnamealiases/). + + During a crawl, this action maps one hostname to another whenever the crawler encounters specific URLs. + This helps with links to staging environments (like `dev.example.com`) or external hosting services (such as YouTube). + + + For example, with this `hostnameAliases` mapping: + + { + hostnameAliases: { + 'dev.example.com': 'example.com' + } + } + + 1. The crawler encounters `https://dev.example.com/solutions/voice-search/`. + + 1. `hostnameAliases` transforms the URL to `https://example.com/solutions/voice-search/`. + + 1. The crawler follows the transformed URL (not the original). + + + **`hostnameAliases` only changes URLs, not page text. In the preceding example, if the extracted text contains the string `dev.example.com`, it remains unchanged.** + + + The crawler can discover URLs in places such as: + + + - Crawled pages + + - Sitemaps + + - [Canonical URLs](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#canonical-urls-and-crawler-behavior) + + - Redirects. + + + However, `hostnameAliases` doesn't transform URLs you explicitly set in the `startUrls` or `sitemaps` parameters, + nor does it affect the `pathsToMatch` action or other configuration elements. additionalProperties: type: string description: Hostname that should be added in the records. @@ -153,12 +206,16 @@ pathAliases: '/foo': '/bar' description: | Key-value pairs to replace matching paths with new values. + It doesn't replace: - + + - URLs in the `startUrls`, `sitemaps`, `pathsToMatch`, and other settings. + - Paths found in extracted text. + The crawl continues from the _transformed_ URLs. additionalProperties: type: object @@ -172,9 +229,10 @@ pathAliases: cache: type: object description: | - Whether the crawler should cache crawled pages. + Whether the crawler should cache crawled pages. + - For more information, see the [`cache` documentation](https://www.algolia.com/doc/tools/crawler/apis/cache/). + For more information, see [Partial crawls with caching](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#partial-crawls-with-caching). properties: enabled: type: boolean diff --git a/specs/crawler/common/schemas/configuration.yml b/specs/crawler/common/schemas/configuration.yml index e87bfd6418..b95fc81c3f 100644 --- a/specs/crawler/common/schemas/configuration.yml +++ b/specs/crawler/common/schemas/configuration.yml @@ -16,9 +16,20 @@ Configuration: apiKey: type: string description: | - Algolia API key for indexing the records. + The Algolia API key the crawler uses for indexing records. + If you don't provide an API key, one will be generated by the Crawler when you create a configuration. - For more information, see the [`apiKey` documentation](https://www.algolia.com/doc/tools/crawler/apis/apikey/). + + The API key must have: + + + - These [rights and restrictions](https://www.algolia.com/doc/guides/security/api-keys/#rights-and-restrictions): `search`, `addObject`, `deleteObject`, `deleteIndex`, `settings`, `editSettings`, `listIndexes`, `browse` + + - Access to the correct set of indices, based on the crawler's `indexPrefix`. 
+ For example, if the prefix is `crawler_`, the API key must have access to `crawler_*`) + + + **You can't use your [Admin API key](https://www.algolia.com/doc/guides/security/api-keys/#predefined-api-keys).** appId: $ref: '../parameters.yml#/applicationID' exclusionPatterns: @@ -34,12 +45,14 @@ Configuration: description: | URLs to exclude from crawling. + Uses [micromatch](https://github.com/micromatch/micromatch) for negation, wildcards, and more. externalData: type: array description: | References to external data sources for enriching the extracted records. + For more information, see [Enrich extracted records with external data](https://www.algolia.com/doc/tools/crawler/guides/enriching-extraction-with-external-data/). maxItems: 10 items: @@ -59,22 +72,44 @@ Configuration: ignoreNoFollowTo: type: boolean description: | - Whether to ignore the `nofollow` meta tag or link attribute. + Determines if the crawler should follow links with a `nofollow` directive. + If `true`, the crawler will ignore the `nofollow` directive and crawl links on the page. + + + The crawler always ignores links that don't match your [configuration settings](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#exclude-and-include-content). + + + `ignoreNoFollowTo` applies to: + - For more information, see the [`ignoreNoFollowTo` documentation](https://www.algolia.com/doc/tools/crawler/apis/ignorenofollowto/). + - Links that are ignored because the [`robots` meta tag](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta/name#Other_metadata_names) contains `nofollow` or `none`. + + - Links with a [`rel` attribute](https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/rel) containing the `nofollow` directive. ignoreNoIndex: type: boolean description: | Whether to ignore the `noindex` robots meta tag. If `true` pages with this meta tag _will_ be crawled. + ignorePaginationAttributes: + type: boolean + description: | + Whether the crawler should follow `rel="prev"` and `rel="next"` pagination links in your `` HTML. + + + - If `true`, the crawler ignores the pagination links. + + - If `false`, the crawler follows the pagination links. + default: true ignoreQueryParams: type: array description: | Query parameters to ignore while crawling. + All URLs with the matching query parameters will be treated as identical. This prevents indexing URLs that just differ by their query parameters. + You can use wildcard characters to pattern match. maxItems: 9999 example: @@ -96,7 +131,10 @@ Configuration: description: | Crawler index settings. + These index settings are only applied during the first crawl of an index. + + Any subsequent changes won't be applied to the index. Instead, make changes to your index settings in the [Algolia dashboard](https://dashboard.algolia.com/explorer/configuration/). additionalProperties: @@ -108,7 +146,8 @@ Configuration: description: | Function for extracting URLs from links on crawled pages. - For more information, see the [`linkExtractor` documentation](https://www.algolia.com/doc/tools/crawler/apis/linkextractor/). + + For more information, see the [`linkExtractor` documentation](https://www.algolia.com/doc/tools/crawler/apis/configuration/link-extractor/). properties: __type: $ref: './action.yml#/configurationRecordExtractorType' @@ -126,34 +165,85 @@ Configuration: login: $ref: '#/login' maxDepth: - type: number + type: integer description: | - Maximum path depth of crawled URLs. 
- For example, if `maxDepth` is 2, `https://example.com/foo/bar` is crawled, - but `https://example.com/foo/bar/baz` won't. - Trailing slashes increase the URL depth. + Determines the maximum path depth of crawled URLs. + + + Path depth is calculated based on the number of slash characters (`/`) after the domain (starting at 1). + For example: + + + **1** `http://example.com` + + + **1** `http://example.com/` + + + **1** `http://example.com/foo` + + + **2** `http://example.com/foo/` + + + **2** `http://example.com/foo/bar` + + + **3** `http://example.com/foo/bar/` + + + **URLs added with `startUrls` and `sitemaps` aren't checked for `maxDepth`.** minimum: 1 maximum: 100 + example: 5 maxUrls: - type: number + type: integer description: | Limits the number of URLs your crawler processes. + Change it to a low value, such as 100, for quick crawling tests. + + Change it to a higher explicit value for full crawls to prevent it from getting "lost" in complex site structures. + Because the Crawler works on many pages simultaneously, `maxUrls` doesn't guarantee finding the same pages each time it runs. minimum: 1 maximum: 15000000 + example: 250 rateLimit: - type: number + type: integer description: | - Number of concurrent tasks per second. + Determines the number of concurrent tasks per second that can run for this configuration. + + + A higher rate limit means more crawls per second. + + + Algolia prevents system overload by ensuring the number of URLs added in the last second and the number of URLs being processed is less than the rate limit: + + + ``` + max(new_urls_added, active_urls_processing) <= rateLimit + ``` + - If processing each URL takes _n_ seconds, - your crawler can process `rateLimit / n` URLs per second. + Start with a low value (for example, 2) and increase it if you need faster crawling. + Be aware that a high `rateLimit` can have a huge impact on bandwidth cost and server resource consumption. - Higher numbers mean faster crawls but they also increase your bandwidth and server load. + + The number of pages processed per second depends on the average time it takes to fetch, process, and upload a URL. + For a given `rateLimit` if fetching, processing, and uploading URLs takes (on average): + + + - Less than a second, your crawler processes up to `rateLimit` pages per second. + + - Four seconds, your crawler processes up to `rateLimit / 4` pages per second. + + + In the latter case, increasing `rateLimit` improves performance, up to a point. + However, if the processing time remains at four seconds, increasing `rateLimit` won't increase the number of pages processed per second. minimum: 1 maximum: 100 example: 4 @@ -264,11 +354,11 @@ requestOptions: type: string description: Proxy for all crawler requests. timeout: - type: number + type: integer default: 30000 description: Timeout in milliseconds for the crawl. retries: - type: number + type: integer default: 3 description: Maximum number of retries to crawl one URL. headers: @@ -279,12 +369,12 @@ waitTime: description: Timeout for the HTTP request. properties: min: - type: number + type: integer default: 0 description: Minimum waiting time in milliseconds. example: 7000 max: - type: number + type: integer default: 20000 description: Maximum waiting time in milliseconds. example: 15000 @@ -312,13 +402,64 @@ headers: example: session=1234 login: - description: Authorization method and credentials for crawling protected content. + description: > + Authorization method and credentials for crawling protected content. 
+ + + The Crawler has several authentication methods for accessing protected content: + + + - **Basic authentication.** The Crawler obtains a session cookie from the login page. + + - **OAuth 2.0 authentication** (`oauthRequest`). The Crawler uses OAuth 2.0 client credentials to obtain an access token for authentication. + + + **Basic authentication** + + + The Crawler extracts the `Set-Cookie` response header from the login page, stores that cookie, and sends it in the `Cookie` header when crawling all pages defined in the configuration. + + + This cookie is retrieved only at the start of each full crawl. + If it expires, it isn't automatically renewed. + + + The Crawler can obtain the session cookie in one of two ways: + + + - **HTTP request authentication** (`fetchRequest`). The Crawler sends a direct request with your credentials to the login endpoint, similar to a `curl` command. + + - **Browser-based authentication** (`browserRequest`). The Crawler emulates a web browser by loading the login page, entering the credentials, and submitting the login form as a real user would. + + + **OAuth 2.0** + + + The crawler supports [OAuth 2.0 client credentials grant flow](https://datatracker.ietf.org/doc/html/rfc6749#section-4.4): + + + 1. It performs an access token request with the provided credentials + + 1. Stores the fetched token in an `Authorization` header + + 1. Sends the token when crawling site pages. + + + > This token is only fetched at the beginning of each complete crawl. + If it expires, it isn't automatically renewed. + + + Client authentication passes the credentials (`client_id` and `client_secret`) [in the request body](https://datatracker.ietf.org/doc/html/rfc6749#section-2.3.1). + + + The [Azure AD v1.0](https://learn.microsoft.com/en-us/previous-versions/azure/active-directory/azuread-dev/v1-oauth2-client-creds-grant-flow) provider is supported. oneOf: - $ref: '#/fetchRequest' - $ref: '#/browserRequest' - $ref: '#/oauthRequest' fetchRequest: + title: HTTP request type: object description: Information for making a HTTP request for authorization. properties: @@ -330,8 +471,17 @@ fetchRequest: $ref: '#/loginRequestOptions' required: - url + example: + url: "https://example.com/secure/login-with-post" + requestOptions: + method: "POST" + headers: + Content-Type: "application/x-www-form-urlencoded" + body: "id=my-id&password=my-password" + timeout: 5000 browserRequest: + title: Browser-based type: object description: | Information for using a web browser for authorization. @@ -342,6 +492,7 @@ browserRequest: description: | URL of your login page. + The crawler looks for an input matching the selector `input[type=text]` or `input[type=email]` for the username and `input[type=password]` for the password. example: https://example.com/login username: @@ -358,17 +509,32 @@ browserRequest: - url - username - password + example: + url: "https://example.com/secure/login-page" + username: "my-id" + password: "my-password" oauthRequest: + title: OAuth 2.0 type: object description: | Authorization information for using the [OAuth 2.0 client credentials](https://datatracker.ietf.org/doc/html/rfc6749#section-4.4) authorization grant. + + OAuth authorization is supported for [Azure Active Directory version 1](https://learn.microsoft.com/en-us/previous-versions/azure/active-directory/azuread-dev/v1-oauth2-client-creds-grant-flow) as provider. 
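As an illustration of the client credentials flow described above, the sketch below shows the kind of token request the crawler performs. The endpoint, client ID, and secret are placeholders, and the exact request the Crawler sends may differ.

```js
// Sketch of an OAuth 2.0 client credentials token request (RFC 6749, section 4.4).
// The endpoint and credentials below are placeholders.
const response = await fetch('https://example.com/oauth2/token', {
  method: 'POST',
  headers: { 'Content-Type': 'application/x-www-form-urlencoded' },
  body: new URLSearchParams({
    grant_type: 'client_credentials',
    client_id: 'my-client-id',
    client_secret: 'my-client-secret',
  }),
});
const { access_token } = await response.json();
// The crawler then sends the fetched token with each page request:
// Authorization: Bearer <access_token>
```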
properties: accessTokenRequest: $ref: '#/accessTokenRequest' required: - accessTokenRequest + example: + accessTokenRequest: + url: "https://example.com/oauth2/token" + grant_type: "client_credentials" + client_id: "my-client-id" + client_secret: "my-client-secret" + extraParameters: + resource: "https://protected.example.com/" loginRequestOptions: type: object @@ -386,7 +552,7 @@ loginRequestOptions: description: Form content. example: 'id=user&password=s3cr3t' timeout: - type: number + type: integer description: Timeout for the request. accessTokenRequest: @@ -433,13 +599,14 @@ extraParameters: description: | App ID URI of the receiving web service. - For more information, see [Azure Active Directory](https://learn.microsoft.com/en-us/previous-versions/azure/active-directory/azuread-dev/v1-oauth2-client-creds-grant-flow#first-case-access-token-request-with-a-shared-secret). + For more information, see [Azure Active Directory](https://learn.microsoft.com/en-us/previous-versions/azure/active-directory/azuread-dev/v1-oauth2-client-creds-grant-flow#first-case-access-token-request-with-a-shared-secret). safetyChecks: type: object description: | Checks to ensure the crawl was successful. + For more information, see the [Safety checks](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#safety-checks) documentation. properties: beforeIndexPublishing: @@ -450,13 +617,13 @@ beforeIndexPublishing: description: Checks triggered after the crawl finishes but before the records are added to the Algolia index. properties: maxLostRecordsPercentage: - type: number + type: integer description: Maximum difference in percent between the numbers of records between crawls. minimum: 1 maximum: 100 default: 10 maxFailedUrls: - type: number + type: integer description: Stops the crawler if a specified number of pages fail to crawl. schedule: @@ -464,5 +631,25 @@ schedule: description: | Schedule for running the crawl. - For more information, see the [`schedule` documentation](https://www.algolia.com/doc/tools/crawler/apis/schedule/). + + Instead of manually starting a crawl each time, you can set up a schedule for automatic crawls. + + + [Use the visual UI](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration-visual/) or add the `schedule` parameter to [your configuration](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/). + + + `schedule` uses [Later.js syntax](https://bunkat.github.io/later/) to specify when to crawl your site. + Here are some key things to keep in mind when using `Later.js` syntax with the Crawler: + + - The interval between two scheduled crawls must be at least 24 hours. + + - To crawl daily, use "every 1 day" instead of "everyday" or "every day". + + - If you don't specify a time, the crawl can happen any time during the scheduled day. + + - Specify times for the UTC (GMT+0) timezone + + - Include minutes when specifying a time. For example, "at 3:00 pm" instead of "at 3pm". + + - Use "at 12:00 am" to specify midnight, not "at 00:00 am". example: every weekday at 12:00 pm diff --git a/specs/crawler/spec.yml b/specs/crawler/spec.yml index d6b69e9966..1af804a0bb 100644 --- a/specs/crawler/spec.yml +++ b/specs/crawler/spec.yml @@ -13,14 +13,12 @@ info: **All requests must use HTTPS.** ## Availability and authentication - - Access to the Crawler API is available with the [Crawler add-on](https://www.algolia.com/pricing/). 
- + To authenticate your API requests, use the **basic authentication** header: - `Authorization: Basic ` - where `` is a base64-encoded string `:`. + Where `` is a base64-encoded string `:`. - ``. The Crawler user ID. - ``. The Crawler API key. From 103ffe0dd71675d7e67b17db65e3877e33368017 Mon Sep 17 00:00:00 2001 From: gazconroy Date: Wed, 19 Mar 2025 10:00:10 +0000 Subject: [PATCH 2/9] Example for pathAliases --- specs/crawler/common/schemas/action.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/specs/crawler/common/schemas/action.yml b/specs/crawler/common/schemas/action.yml index f13a34380d..7571815aad 100644 --- a/specs/crawler/common/schemas/action.yml +++ b/specs/crawler/common/schemas/action.yml @@ -32,7 +32,7 @@ Action: It functions similarly to the `pathsToMatch` action but without record extraction. - `discoveryPatterns` uses [micromatch](https://github.com/micromatch/micromatch) to support matching with wildcards, negation, and other features. + Uses [micromatch](https://github.com/micromatch/micromatch) to match wildcards, negation, and other features. The crawler adds all matching URLs to its queue. items: $ref: '#/urlPattern' @@ -217,6 +217,14 @@ pathAliases: The crawl continues from the _transformed_ URLs. + + + For example, if you create a mapping for `{ "dev.example.com": { '/foo': '/bar' } }` and the crawler encounters `https://dev.example.com/foo/hello/`, + it’s transformed to `https://dev.example.com/bar/hello/`. + + + > Compare with the `hostnameAliases` action. + additionalProperties: type: object description: Hostname for which matching paths should be replaced. From 17bbe04c546a5c14c1e8594e4821c04178e14496 Mon Sep 17 00:00:00 2001 From: gazconroy Date: Wed, 19 Mar 2025 10:10:17 +0000 Subject: [PATCH 3/9] Typos --- specs/crawler/common/schemas/configuration.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/specs/crawler/common/schemas/configuration.yml b/specs/crawler/common/schemas/configuration.yml index b95fc81c3f..7c037c5bf6 100644 --- a/specs/crawler/common/schemas/configuration.yml +++ b/specs/crawler/common/schemas/configuration.yml @@ -23,13 +23,14 @@ Configuration: The API key must have: - - These [rights and restrictions](https://www.algolia.com/doc/guides/security/api-keys/#rights-and-restrictions): `search`, `addObject`, `deleteObject`, `deleteIndex`, `settings`, `editSettings`, `listIndexes`, `browse` + - These [rights and restrictions](https://www.algolia.com/doc/guides/security/api-keys/#rights-and-restrictions): + `search`, `addObject`, `deleteObject`, `deleteIndex`, `settings`, `editSettings`, `listIndexes`, `browse`. - Access to the correct set of indices, based on the crawler's `indexPrefix`. - For example, if the prefix is `crawler_`, the API key must have access to `crawler_*`) + For example, if the prefix is `crawler_`, the API key must have access to `crawler_*`. - **You can't use your [Admin API key](https://www.algolia.com/doc/guides/security/api-keys/#predefined-api-keys).** + **Don't use your [Admin API key](https://www.algolia.com/doc/guides/security/api-keys/#predefined-api-keys).** appId: $ref: '../parameters.yml#/applicationID' exclusionPatterns: @@ -287,9 +288,9 @@ ignoreCanonicalTo: oneOf: - type: boolean description: | - Determines if the crawler should extract records from a page with a [canonical URL](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#canonical-urls-and-crawler-behaviorr). 
+ Determines if the crawler should extract records from a page with a [canonical URL](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#canonical-urls-and-crawler-behavior). - If ignoreCanonicalTo is set to: + If `ignoreCanonicalTo` is set to: - `true` all canonical URLs are ignored. - One or more URL patterns, the crawler will ignore the canonical URL if it matches a pattern. From c20759321024fe75098c7005fc48667048a141c2 Mon Sep 17 00:00:00 2001 From: Kai Welke Date: Wed, 19 Mar 2025 11:10:01 +0100 Subject: [PATCH 4/9] fix: whitespace --- specs/crawler/common/schemas/action.yml | 74 +-------- .../crawler/common/schemas/configuration.yml | 140 +++++------------- 2 files changed, 43 insertions(+), 171 deletions(-) diff --git a/specs/crawler/common/schemas/action.yml b/specs/crawler/common/schemas/action.yml index 7571815aad..91f66ba041 100644 --- a/specs/crawler/common/schemas/action.yml +++ b/specs/crawler/common/schemas/action.yml @@ -3,14 +3,10 @@ Action: description: | How to process crawled URLs. - Each action defines: - - The targeted subset of URLs it processes. - - What information to extract from the web pages. - - The Algolia indices where the extracted records will be stored. If a single web page matches several actions, @@ -26,14 +22,12 @@ Action: type: array description: | Which _intermediary_ web pages the crawler should visit. - Use `discoveryPatterns` to define pages that should be visited _just_ for their links to other pages, _not_ their content. - - + Use `discoveryPatterns` to define pages that should be visited _just_ for their links to other pages, + _not_ their content. It functions similarly to the `pathsToMatch` action but without record extraction. - - Uses [micromatch](https://github.com/micromatch/micromatch) to match wildcards, negation, and other features. - The crawler adds all matching URLs to its queue. + `discoveryPatterns` uses [micromatch](https://github.com/micromatch/micromatch) to support matching with wildcards, + negation, and other features. items: $ref: '#/urlPattern' fileTypesToMatch: @@ -41,7 +35,6 @@ Action: description: | File types for crawling non-HTML documents. - For more information, see [Extract data from non-HTML documents](https://www.algolia.com/doc/tools/crawler/extracting-data/non-html-documents/). maxItems: 100 items: @@ -70,7 +63,6 @@ Action: description: | URLs to which this action should apply. - Uses [micromatch](https://github.com/micromatch/micromatch) for negation, wildcards, and more. minItems: 1 maxItems: 100 @@ -82,11 +74,8 @@ Action: description: | Function for extracting information from a crawled page and transforming it into Algolia records for indexing. - The Crawler has an [editor](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#the-editor) with autocomplete and validation to help you update the `recordExtractor`. - - - For details, consult the [`recordExtractor` documentation](https://www.algolia.com/doc/tools/crawler/apis/configuration/actions/#parameter-param-recordextractor). + For details, see the [`recordExtractor` documentation](https://www.algolia.com/doc/tools/crawler/apis/configuration/actions/#parameter-param-recordextractor). properties: __type: $ref: '#/configurationRecordExtractorType' @@ -127,7 +116,6 @@ fileTypes: description: | Supported file types for indexing non-HTML documents. - For more information, see [Extract data from non-HTML documents](https://www.algolia.com/doc/tools/crawler/extracting-data/non-html-documents/). 
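As a sketch of how these file types are typically used, an action might restrict non-HTML crawling as shown below (shown as a JavaScript configuration fragment); the URLs and extractor body are placeholders, not values from this spec.

```js
// Hypothetical action fragment; URLs and extractor logic are placeholders.
const action = {
  pathsToMatch: ['https://example.com/reports/**'],
  // Only crawl non-HTML documents of these types (see the enum below).
  fileTypesToMatch: ['pdf', 'doc'],
  recordExtractor: ({ url }) => [{ url: url.href }],
};
```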
enum: - doc @@ -145,7 +133,6 @@ urlPattern: description: | Pattern for matching URLs. - Uses [micromatch](https://github.com/micromatch/micromatch) for negation, wildcards, and more. example: https://www.algolia.com/** @@ -153,47 +140,7 @@ hostnameAliases: type: object example: 'dev.example.com': 'example.com' - description: | - Key-value pairs to replace matching hostnames found in a sitemap, - on a page, in canonical links, or redirects. - - - During a crawl, this action maps one hostname to another whenever the crawler encounters specific URLs. - This helps with links to staging environments (like `dev.example.com`) or external hosting services (such as YouTube). - - - For example, with this `hostnameAliases` mapping: - - { - hostnameAliases: { - 'dev.example.com': 'example.com' - } - } - - 1. The crawler encounters `https://dev.example.com/solutions/voice-search/`. - - 1. `hostnameAliases` transforms the URL to `https://example.com/solutions/voice-search/`. - - 1. The crawler follows the transformed URL (not the original). - - - **`hostnameAliases` only changes URLs, not page text. In the preceding example, if the extracted text contains the string `dev.example.com`, it remains unchanged.** - - - The crawler can discover URLs in places such as: - - - - Crawled pages - - - Sitemaps - - - [Canonical URLs](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#canonical-urls-and-crawler-behavior) - - - Redirects. - - - However, `hostnameAliases` doesn't transform URLs you explicitly set in the `startUrls` or `sitemaps` parameters, - nor does it affect the `pathsToMatch` action or other configuration elements. + description: "Key-value pairs to replace matching hostnames found in a sitemap,\non a page, in canonical links, or redirects.\n\n\nDuring a crawl, this action maps one hostname to another whenever the crawler encounters specific URLs.\nThis helps with links to staging environments (like `dev.example.com`) or external hosting services (such as YouTube).\n\n\nFor example, with this `hostnameAliases` mapping:\n\n {\n hostnameAliases: {\n 'dev.example.com': 'example.com'\n }\n }\n\n1. The crawler encounters `https://dev.example.com/solutions/voice-search/`.\n\n1. `hostnameAliases` transforms the URL to `https://example.com/solutions/voice-search/`.\n\n1. The crawler follows the transformed URL (not the original).\n\n\n**`hostnameAliases` only changes URLs, not page text. In the preceding example, if the extracted text contains the string `dev.example.com`, it remains unchanged.**\n\n\nThe crawler can discover URLs in places such as:\n\n\n- Crawled pages\n\n- Sitemaps\n\n- [Canonical URLs](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#canonical-urls-and-crawler-behavior)\n\n- Redirects. \n\n\nHowever, `hostnameAliases` doesn't transform URLs you explicitly set in the `startUrls` or `sitemaps` parameters,\nnor does it affect the `pathsToMatch` action or other configuration elements.\n" additionalProperties: type: string description: Hostname that should be added in the records. @@ -207,15 +154,11 @@ pathAliases: description: | Key-value pairs to replace matching paths with new values. - It doesn't replace: - - URLs in the `startUrls`, `sitemaps`, `pathsToMatch`, and other settings. - - Paths found in extracted text. - The crawl continues from the _transformed_ URLs. @@ -237,10 +180,9 @@ pathAliases: cache: type: object description: | - Whether the crawler should cache crawled pages. 
- + Whether the crawler should cache crawled pages. - For more information, see [Partial crawls with caching](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#partial-crawls-with-caching). + For more information, see [Partial crawls with caching](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#partial-crawls-with-caching). properties: enabled: type: boolean diff --git a/specs/crawler/common/schemas/configuration.yml b/specs/crawler/common/schemas/configuration.yml index 7c037c5bf6..e4c978a546 100644 --- a/specs/crawler/common/schemas/configuration.yml +++ b/specs/crawler/common/schemas/configuration.yml @@ -19,18 +19,13 @@ Configuration: The Algolia API key the crawler uses for indexing records. If you don't provide an API key, one will be generated by the Crawler when you create a configuration. - The API key must have: - - - These [rights and restrictions](https://www.algolia.com/doc/guides/security/api-keys/#rights-and-restrictions): - `search`, `addObject`, `deleteObject`, `deleteIndex`, `settings`, `editSettings`, `listIndexes`, `browse`. - + - These [rights and restrictions](https://www.algolia.com/doc/guides/security/api-keys/#rights-and-restrictions): `search`, `addObject`, `deleteObject`, `deleteIndex`, `settings`, `editSettings`, `listIndexes`, `browse` - Access to the correct set of indices, based on the crawler's `indexPrefix`. For example, if the prefix is `crawler_`, the API key must have access to `crawler_*`. - - **Don't use your [Admin API key](https://www.algolia.com/doc/guides/security/api-keys/#predefined-api-keys).** + **Don't use your [Admin API key](https://www.algolia.com/doc/guides/security/api-keys/#predefined-api-keys)**. appId: $ref: '../parameters.yml#/applicationID' exclusionPatterns: @@ -46,14 +41,12 @@ Configuration: description: | URLs to exclude from crawling. - Uses [micromatch](https://github.com/micromatch/micromatch) for negation, wildcards, and more. externalData: type: array description: | References to external data sources for enriching the extracted records. - For more information, see [Enrich extracted records with external data](https://www.algolia.com/doc/tools/crawler/guides/enriching-extraction-with-external-data/). maxItems: 10 items: @@ -76,15 +69,10 @@ Configuration: Determines if the crawler should follow links with a `nofollow` directive. If `true`, the crawler will ignore the `nofollow` directive and crawl links on the page. - The crawler always ignores links that don't match your [configuration settings](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#exclude-and-include-content). - - `ignoreNoFollowTo` applies to: - - Links that are ignored because the [`robots` meta tag](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta/name#Other_metadata_names) contains `nofollow` or `none`. - - Links with a [`rel` attribute](https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/rel) containing the `nofollow` directive. ignoreNoIndex: type: boolean @@ -96,9 +84,7 @@ Configuration: description: | Whether the crawler should follow `rel="prev"` and `rel="next"` pagination links in your `` HTML. - - If `true`, the crawler ignores the pagination links. - - If `false`, the crawler follows the pagination links. default: true ignoreQueryParams: @@ -106,11 +92,9 @@ Configuration: description: | Query parameters to ignore while crawling. - All URLs with the matching query parameters will be treated as identical. 
This prevents indexing URLs that just differ by their query parameters. - You can use wildcard characters to pattern match. maxItems: 9999 example: @@ -132,10 +116,8 @@ Configuration: description: | Crawler index settings. - These index settings are only applied during the first crawl of an index. - Any subsequent changes won't be applied to the index. Instead, make changes to your index settings in the [Algolia dashboard](https://dashboard.algolia.com/explorer/configuration/). additionalProperties: @@ -147,7 +129,6 @@ Configuration: description: | Function for extracting URLs from links on crawled pages. - For more information, see the [`linkExtractor` documentation](https://www.algolia.com/doc/tools/crawler/apis/configuration/link-extractor/). properties: __type: @@ -170,30 +151,17 @@ Configuration: description: | Determines the maximum path depth of crawled URLs. - Path depth is calculated based on the number of slash characters (`/`) after the domain (starting at 1). For example: + - **1** `http://example.com` + - **1** `http://example.com/` + - **1** `http://example.com/foo` + - **2** `http://example.com/foo/` + - **2** `http://example.com/foo/bar` + - **3** `http://example.com/foo/bar/` - **1** `http://example.com` - - - **1** `http://example.com/` - - - **1** `http://example.com/foo` - - - **2** `http://example.com/foo/` - - - **2** `http://example.com/foo/bar` - - - **3** `http://example.com/foo/bar/` - - - **URLs added with `startUrls` and `sitemaps` aren't checked for `maxDepth`.** + **URLs added with `startUrls` and `sitemaps` aren't checked for `maxDepth`.**. minimum: 1 maximum: 100 example: 5 @@ -202,13 +170,8 @@ Configuration: description: | Limits the number of URLs your crawler processes. - Change it to a low value, such as 100, for quick crawling tests. - - Change it to a higher explicit value for full crawls to prevent it from getting "lost" in complex site structures. - - Because the Crawler works on many pages simultaneously, `maxUrls` doesn't guarantee finding the same pages each time it runs. minimum: 1 maximum: 15000000 @@ -218,10 +181,7 @@ Configuration: description: | Determines the number of concurrent tasks per second that can run for this configuration. - A higher rate limit means more crawls per second. - - Algolia prevents system overload by ensuring the number of URLs added in the last second and the number of URLs being processed is less than the rate limit: @@ -229,20 +189,15 @@ Configuration: max(new_urls_added, active_urls_processing) <= rateLimit ``` - Start with a low value (for example, 2) and increase it if you need faster crawling. Be aware that a high `rateLimit` can have a huge impact on bandwidth cost and server resource consumption. - The number of pages processed per second depends on the average time it takes to fetch, process, and upload a URL. For a given `rateLimit` if fetching, processing, and uploading URLs takes (on average): - - Less than a second, your crawler processes up to `rateLimit` pages per second. - - Four seconds, your crawler processes up to `rateLimit / 4` pages per second. - In the latter case, increasing `rateLimit` improves performance, up to a point. However, if the processing time remains at four seconds, increasing `rateLimit` won't increase the number of pages processed per second. minimum: 1 @@ -403,56 +358,44 @@ headers: example: session=1234 login: - description: > + description: | Authorization method and credentials for crawling protected content. 
+ The Crawler supports these authentication methods: - The Crawler has several authentication methods for accessing protected content: - - - - **Basic authentication.** The Crawler obtains a session cookie from the login page. - - - **OAuth 2.0 authentication** (`oauthRequest`). The Crawler uses OAuth 2.0 client credentials to obtain an access token for authentication. - + - **Basic authentication**. + The Crawler obtains a session cookie from the login page. + - **OAuth 2.0 authentication** (`oauthRequest`). + The Crawler uses OAuth 2.0 client credentials to obtain an access token for authentication. **Basic authentication** - - The Crawler extracts the `Set-Cookie` response header from the login page, stores that cookie, and sends it in the `Cookie` header when crawling all pages defined in the configuration. - + The Crawler extracts the `Set-Cookie` response header from the login page, stores that cookie, + and sends it in the `Cookie` header when crawling all pages defined in the configuration. This cookie is retrieved only at the start of each full crawl. If it expires, it isn't automatically renewed. - The Crawler can obtain the session cookie in one of two ways: - - - **HTTP request authentication** (`fetchRequest`). The Crawler sends a direct request with your credentials to the login endpoint, similar to a `curl` command. - - - **Browser-based authentication** (`browserRequest`). The Crawler emulates a web browser by loading the login page, entering the credentials, and submitting the login form as a real user would. - + - **HTTP request authentication** (`fetchRequest`). + The Crawler sends a direct request with your credentials to the login endpoint, similar to a `curl` command. + - **Browser-based authentication** (`browserRequest`). + The Crawler emulates a web browser by loading the login page, entering the credentials, + and submitting the login form as a real user would. **OAuth 2.0** - The crawler supports [OAuth 2.0 client credentials grant flow](https://datatracker.ietf.org/doc/html/rfc6749#section-4.4): - 1. It performs an access token request with the provided credentials - 1. Stores the fetched token in an `Authorization` header - 1. Sends the token when crawling site pages. - - > This token is only fetched at the beginning of each complete crawl. + This token is only fetched at the beginning of each complete crawl. If it expires, it isn't automatically renewed. - Client authentication passes the credentials (`client_id` and `client_secret`) [in the request body](https://datatracker.ietf.org/doc/html/rfc6749#section-2.3.1). - - The [Azure AD v1.0](https://learn.microsoft.com/en-us/previous-versions/azure/active-directory/azuread-dev/v1-oauth2-client-creds-grant-flow) provider is supported. oneOf: - $ref: '#/fetchRequest' @@ -473,12 +416,12 @@ fetchRequest: required: - url example: - url: "https://example.com/secure/login-with-post" + url: 'https://example.com/secure/login-with-post' requestOptions: - method: "POST" + method: 'POST' headers: - Content-Type: "application/x-www-form-urlencoded" - body: "id=my-id&password=my-password" + Content-Type: 'application/x-www-form-urlencoded' + body: 'id=my-id&password=my-password' timeout: 5000 browserRequest: @@ -493,7 +436,6 @@ browserRequest: description: | URL of your login page. - The crawler looks for an input matching the selector `input[type=text]` or `input[type=email]` for the username and `input[type=password]` for the password. 
example: https://example.com/login username: @@ -511,9 +453,9 @@ browserRequest: - username - password example: - url: "https://example.com/secure/login-page" - username: "my-id" - password: "my-password" + url: 'https://example.com/secure/login-page' + username: 'my-id' + password: 'my-password' oauthRequest: title: OAuth 2.0 @@ -521,7 +463,6 @@ oauthRequest: description: | Authorization information for using the [OAuth 2.0 client credentials](https://datatracker.ietf.org/doc/html/rfc6749#section-4.4) authorization grant. - OAuth authorization is supported for [Azure Active Directory version 1](https://learn.microsoft.com/en-us/previous-versions/azure/active-directory/azuread-dev/v1-oauth2-client-creds-grant-flow) as provider. properties: accessTokenRequest: @@ -530,12 +471,12 @@ oauthRequest: - accessTokenRequest example: accessTokenRequest: - url: "https://example.com/oauth2/token" - grant_type: "client_credentials" - client_id: "my-client-id" - client_secret: "my-client-secret" + url: 'https://example.com/oauth2/token' + grant_type: 'client_credentials' + client_id: 'my-client-id' + client_secret: 'my-client-secret' extraParameters: - resource: "https://protected.example.com/" + resource: 'https://protected.example.com/' loginRequestOptions: type: object @@ -600,14 +541,12 @@ extraParameters: description: | App ID URI of the receiving web service. - For more information, see [Azure Active Directory](https://learn.microsoft.com/en-us/previous-versions/azure/active-directory/azuread-dev/v1-oauth2-client-creds-grant-flow#first-case-access-token-request-with-a-shared-secret). safetyChecks: type: object description: | Checks to ensure the crawl was successful. - For more information, see the [Safety checks](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#safety-checks) documentation. properties: beforeIndexPublishing: @@ -632,25 +571,16 @@ schedule: description: | Schedule for running the crawl. - Instead of manually starting a crawl each time, you can set up a schedule for automatic crawls. - - [Use the visual UI](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration-visual/) or add the `schedule` parameter to [your configuration](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/). - `schedule` uses [Later.js syntax](https://bunkat.github.io/later/) to specify when to crawl your site. Here are some key things to keep in mind when using `Later.js` syntax with the Crawler: - The interval between two scheduled crawls must be at least 24 hours. - - To crawl daily, use "every 1 day" instead of "everyday" or "every day". - - If you don't specify a time, the crawl can happen any time during the scheduled day. - - Specify times for the UTC (GMT+0) timezone - - Include minutes when specifying a time. For example, "at 3:00 pm" instead of "at 3pm". - - Use "at 12:00 am" to specify midnight, not "at 00:00 am". 
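As a sketch of how these rules come together, a configuration might set `schedule` as shown below (a JavaScript configuration fragment); the application ID, API key, and URL are placeholders.

```js
// Hypothetical configuration fragment; credentials and URLs are placeholders.
const config = {
  appId: 'YOUR_APP_ID',
  apiKey: 'YOUR_CRAWLER_API_KEY',
  startUrls: ['https://example.com'],
  // Later.js syntax: intervals of at least 24 hours, UTC times, minutes included.
  schedule: 'every 1 day at 3:00 pm',
};
```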
example: every weekday at 12:00 pm From 5aab7435f54582742eab2567dff28773ddc08dd1 Mon Sep 17 00:00:00 2001 From: gazconroy Date: Wed, 19 Mar 2025 10:15:03 +0000 Subject: [PATCH 5/9] Remove "add-on" statement --- specs/crawler/common/parameters.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/specs/crawler/common/parameters.yml b/specs/crawler/common/parameters.yml index 4bf171bbdc..9b43886129 100644 --- a/specs/crawler/common/parameters.yml +++ b/specs/crawler/common/parameters.yml @@ -54,7 +54,6 @@ applicationID: type: string description: | Algolia application ID where the crawler creates and updates indices. - The Crawler add-on must be enabled for this application. CrawlerID: type: string From d814102f9587770c9cb206d0caa361d297a65de0 Mon Sep 17 00:00:00 2001 From: gazconroy Date: Wed, 19 Mar 2025 11:37:03 +0000 Subject: [PATCH 6/9] Clarifications --- specs/crawler/common/schemas/configuration.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/specs/crawler/common/schemas/configuration.yml b/specs/crawler/common/schemas/configuration.yml index e4c978a546..a5cb1bb760 100644 --- a/specs/crawler/common/schemas/configuration.yml +++ b/specs/crawler/common/schemas/configuration.yml @@ -78,11 +78,11 @@ Configuration: type: boolean description: | Whether to ignore the `noindex` robots meta tag. - If `true` pages with this meta tag _will_ be crawled. + If `true`, pages with this meta tag _will_ be crawled. ignorePaginationAttributes: type: boolean description: | - Whether the crawler should follow `rel="prev"` and `rel="next"` pagination links in your `` HTML. + Whether the crawler should follow `rel="prev"` and `rel="next"` pagination links in the `` section of an HTML page. - If `true`, the crawler ignores the pagination links. - If `false`, the crawler follows the pagination links. @@ -92,10 +92,10 @@ Configuration: description: | Query parameters to ignore while crawling. - All URLs with the matching query parameters will be treated as identical. + All URLs with the matching query parameters are treated as identical. This prevents indexing URLs that just differ by their query parameters. - You can use wildcard characters to pattern match. + Use wildcards to match multiple query parameters. maxItems: 9999 example: - ref From 797677f0d3d68c63481c35b06809ba1bd411b8e0 Mon Sep 17 00:00:00 2001 From: gazconroy Date: Wed, 19 Mar 2025 11:58:30 +0000 Subject: [PATCH 7/9] Remove duplicated text from actions --- specs/crawler/common/schemas/action.yml | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/specs/crawler/common/schemas/action.yml b/specs/crawler/common/schemas/action.yml index 91f66ba041..e0b9583323 100644 --- a/specs/crawler/common/schemas/action.yml +++ b/specs/crawler/common/schemas/action.yml @@ -25,17 +25,12 @@ Action: Use `discoveryPatterns` to define pages that should be visited _just_ for their links to other pages, _not_ their content. It functions similarly to the `pathsToMatch` action but without record extraction. - - `discoveryPatterns` uses [micromatch](https://github.com/micromatch/micromatch) to support matching with wildcards, - negation, and other features. items: $ref: '#/urlPattern' fileTypesToMatch: type: array description: | File types for crawling non-HTML documents. - - For more information, see [Extract data from non-HTML documents](https://www.algolia.com/doc/tools/crawler/extracting-data/non-html-documents/). 
maxItems: 100 items: $ref: '#/fileTypes' @@ -91,7 +86,8 @@ Action: maxItems: 100 items: type: string - description: DOM selector. Negation is supported. This lets you ignore pages that match the selector. + description: | + Prefix a selector with `!` to ignore matching pages. example: - .products - '!.featured' @@ -114,8 +110,6 @@ ActionSchedule: fileTypes: type: string description: | - Supported file types for indexing non-HTML documents. - For more information, see [Extract data from non-HTML documents](https://www.algolia.com/doc/tools/crawler/extracting-data/non-html-documents/). enum: - doc @@ -131,9 +125,7 @@ fileTypes: urlPattern: type: string description: | - Pattern for matching URLs. - - Uses [micromatch](https://github.com/micromatch/micromatch) for negation, wildcards, and more. + Use [micromatch](https://github.com/micromatch/micromatch) for negation, wildcards, and more. example: https://www.algolia.com/** hostnameAliases: From 6ccfffc1511e053875c64ba5fc6e996ea579a100 Mon Sep 17 00:00:00 2001 From: gazconroy Date: Wed, 19 Mar 2025 12:05:58 +0000 Subject: [PATCH 8/9] Removed duplicate text from configuration parameters --- specs/crawler/common/schemas/configuration.yml | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/specs/crawler/common/schemas/configuration.yml b/specs/crawler/common/schemas/configuration.yml index a5cb1bb760..2380a7fc6d 100644 --- a/specs/crawler/common/schemas/configuration.yml +++ b/specs/crawler/common/schemas/configuration.yml @@ -39,19 +39,15 @@ Configuration: items: type: string description: | - URLs to exclude from crawling. - - Uses [micromatch](https://github.com/micromatch/micromatch) for negation, wildcards, and more. + Use [micromatch](https://github.com/micromatch/micromatch) for negation, wildcards, and more. externalData: type: array description: | References to external data sources for enriching the extracted records. - - For more information, see [Enrich extracted records with external data](https://www.algolia.com/doc/tools/crawler/guides/enriching-extraction-with-external-data/). maxItems: 10 items: type: string - description: Reference to an external data source you configured in the Crawler dashboard. + description: For more information, see [Enrich extracted records with external data](https://www.algolia.com/doc/tools/crawler/guides/enriching-extraction-with-external-data/). example: testCSV extraUrls: type: array @@ -94,15 +90,13 @@ Configuration: All URLs with the matching query parameters are treated as identical. This prevents indexing URLs that just differ by their query parameters. - - Use wildcards to match multiple query parameters. maxItems: 9999 example: - ref - utm_* items: type: string - description: Query parameter to ignore. You can include wildcards to match a range of similar query parameters. + description: Use wildcards to match multiple query parameters. ignoreRobotsTxtRules: type: boolean description: Whether to ignore rules defined in your `robots.txt` file. 
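To tie these parameters together, a configuration fragment might combine micromatch patterns and query-parameter wildcards as sketched below; the URLs and parameter names are illustrative placeholders.

```js
// Hypothetical configuration fragment; URLs and parameter names are placeholders.
const config = {
  // micromatch patterns support wildcards and negation.
  exclusionPatterns: [
    'https://example.com/internal/**',
    '!https://example.com/internal/docs/**',
  ],
  // Wildcards match families of query parameters, such as all UTM tags.
  ignoreQueryParams: ['ref', 'utm_*'],
  ignoreRobotsTxtRules: false,
};
```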
From 44b852a9d8eb418d51022289c1bb40ab47c1746c Mon Sep 17 00:00:00 2001
From: gazconroy
Date: Wed, 19 Mar 2025 13:33:28 +0000
Subject: [PATCH 9/9] Added schedule action

---
 specs/crawler/common/schemas/action.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/specs/crawler/common/schemas/action.yml b/specs/crawler/common/schemas/action.yml
index e0b9583323..00206d3924 100644
--- a/specs/crawler/common/schemas/action.yml
+++ b/specs/crawler/common/schemas/action.yml
@@ -78,6 +78,12 @@ Action:
           type: string
           description: |
             A JavaScript function (as a string) that returns one or more Algolia records for each crawled page.
+    schedule:
+      type: string
+      description: |
+        How often to perform a complete crawl for this action.
+
+        For more information, consult the [`schedule` parameter documentation](https://www.algolia.com/doc/tools/crawler/apis/configuration/schedule/).
     selectorsToMatch:
       type: array
       description: |
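Taken together, the parameters introduced in this patch series might appear in a configuration as sketched below. This is an illustration only: the `actions` and `indexName` fields come from the broader crawler configuration and aren't part of this excerpt, and the index name, URLs, and extractor logic are placeholders.

```js
// Sketch combining ignorePaginationAttributes with a per-action schedule.
// Index name, URLs, and extractor logic are placeholders.
const config = {
  // Ignore rel="prev" and rel="next" links in each page's <head>.
  ignorePaginationAttributes: true,
  actions: [
    {
      indexName: 'crawler_docs',
      pathsToMatch: ['https://example.com/docs/**'],
      // Per-action schedule in Later.js syntax.
      schedule: 'every 1 day at 3:00 pm',
      recordExtractor: ({ url, $ }) => [
        { objectID: url.href, title: $('title').text() },
      ],
    },
  ],
};
```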