From b20f303b7ba44988ec303cbf757a7b0d19259d5c Mon Sep 17 00:00:00 2001 From: Gary Conroy Date: Tue, 18 Mar 2025 12:40:06 +0000 Subject: [PATCH 1/9] fix(specs): New Crawler API parameter - ignorePaginationAttributes --- specs/crawler/common/schemas/action.yml | 76 +++++- .../crawler/common/schemas/configuration.yml | 239 ++++++++++++++++-- specs/crawler/spec.yml | 6 +- 3 files changed, 282 insertions(+), 39 deletions(-) diff --git a/specs/crawler/common/schemas/action.yml b/specs/crawler/common/schemas/action.yml index 5703f07210..f13a34380d 100644 --- a/specs/crawler/common/schemas/action.yml +++ b/specs/crawler/common/schemas/action.yml @@ -3,10 +3,14 @@ Action: description: | How to process crawled URLs. + Each action defines: + - The targeted subset of URLs it processes. + - What information to extract from the web pages. + - The Algolia indices where the extracted records will be stored. If a single web page matches several actions, @@ -21,9 +25,15 @@ Action: discoveryPatterns: type: array description: | - Indicates _intermediary_ pages that the crawler should visit. + Which _intermediary_ web pages the crawler should visit. + Use `discoveryPatterns` to define pages that should be visited _just_ for their links to other pages, _not_ their content. + - For more information, see the [`discoveryPatterns` documentation](https://www.algolia.com/doc/tools/crawler/apis/discoverypatterns/). + It functions similarly to the `pathsToMatch` action but without record extraction. + + + `discoveryPatterns` uses [micromatch](https://github.com/micromatch/micromatch) to support matching with wildcards, negation, and other features. + The crawler adds all matching URLs to its queue. items: $ref: '#/urlPattern' fileTypesToMatch: @@ -31,6 +41,7 @@ Action: description: | File types for crawling non-HTML documents. + For more information, see [Extract data from non-HTML documents](https://www.algolia.com/doc/tools/crawler/extracting-data/non-html-documents/). maxItems: 100 items: @@ -59,6 +70,7 @@ Action: description: | URLs to which this action should apply. + Uses [micromatch](https://github.com/micromatch/micromatch) for negation, wildcards, and more. minItems: 1 maxItems: 100 @@ -69,9 +81,12 @@ Action: type: object description: | Function for extracting information from a crawled page and transforming it into Algolia records for indexing. - The Crawler has an [editor](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#the-editor) with autocomplete and validation to help you update the `recordExtractor` property. - For details, consult the [`recordExtractor` documentation](https://www.algolia.com/doc/tools/crawler/apis/recordextractor/). + + The Crawler has an [editor](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#the-editor) with autocomplete and validation to help you update the `recordExtractor`. + + + For details, consult the [`recordExtractor` documentation](https://www.algolia.com/doc/tools/crawler/apis/configuration/actions/#parameter-param-recordextractor). properties: __type: $ref: '#/configurationRecordExtractorType' @@ -110,7 +125,8 @@ ActionSchedule: fileTypes: type: string description: | - Supported file type for indexing non-HTML documents. + Supported file types for indexing non-HTML documents. + For more information, see [Extract data from non-HTML documents](https://www.algolia.com/doc/tools/crawler/extracting-data/non-html-documents/). enum: @@ -129,6 +145,7 @@ urlPattern: description: | Pattern for matching URLs. 
+ Uses [micromatch](https://github.com/micromatch/micromatch) for negation, wildcards, and more. example: https://www.algolia.com/** @@ -140,7 +157,43 @@ hostnameAliases: Key-value pairs to replace matching hostnames found in a sitemap, on a page, in canonical links, or redirects. - For more information, see the [`hostnameAliases` documentation](https://www.algolia.com/doc/tools/crawler/apis/hostnamealiases/). + + During a crawl, this action maps one hostname to another whenever the crawler encounters specific URLs. + This helps with links to staging environments (like `dev.example.com`) or external hosting services (such as YouTube). + + + For example, with this `hostnameAliases` mapping: + + { + hostnameAliases: { + 'dev.example.com': 'example.com' + } + } + + 1. The crawler encounters `https://dev.example.com/solutions/voice-search/`. + + 1. `hostnameAliases` transforms the URL to `https://example.com/solutions/voice-search/`. + + 1. The crawler follows the transformed URL (not the original). + + + **`hostnameAliases` only changes URLs, not page text. In the preceding example, if the extracted text contains the string `dev.example.com`, it remains unchanged.** + + + The crawler can discover URLs in places such as: + + + - Crawled pages + + - Sitemaps + + - [Canonical URLs](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#canonical-urls-and-crawler-behavior) + + - Redirects. + + + However, `hostnameAliases` doesn't transform URLs you explicitly set in the `startUrls` or `sitemaps` parameters, + nor does it affect the `pathsToMatch` action or other configuration elements. additionalProperties: type: string description: Hostname that should be added in the records. @@ -153,12 +206,16 @@ pathAliases: '/foo': '/bar' description: | Key-value pairs to replace matching paths with new values. + It doesn't replace: - + + - URLs in the `startUrls`, `sitemaps`, `pathsToMatch`, and other settings. + - Paths found in extracted text. + The crawl continues from the _transformed_ URLs. additionalProperties: type: object @@ -172,9 +229,10 @@ pathAliases: cache: type: object description: | - Whether the crawler should cache crawled pages. + Whether the crawler should cache crawled pages. + - For more information, see the [`cache` documentation](https://www.algolia.com/doc/tools/crawler/apis/cache/). + For more information, see [Partial crawls with caching](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#partial-crawls-with-caching). properties: enabled: type: boolean diff --git a/specs/crawler/common/schemas/configuration.yml b/specs/crawler/common/schemas/configuration.yml index e87bfd6418..b95fc81c3f 100644 --- a/specs/crawler/common/schemas/configuration.yml +++ b/specs/crawler/common/schemas/configuration.yml @@ -16,9 +16,20 @@ Configuration: apiKey: type: string description: | - Algolia API key for indexing the records. + The Algolia API key the crawler uses for indexing records. + If you don't provide an API key, one will be generated by the Crawler when you create a configuration. - For more information, see the [`apiKey` documentation](https://www.algolia.com/doc/tools/crawler/apis/apikey/). + + The API key must have: + + + - These [rights and restrictions](https://www.algolia.com/doc/guides/security/api-keys/#rights-and-restrictions): `search`, `addObject`, `deleteObject`, `deleteIndex`, `settings`, `editSettings`, `listIndexes`, `browse` + + - Access to the correct set of indices, based on the crawler's `indexPrefix`. 
+ For example, if the prefix is `crawler_`, the API key must have access to `crawler_*`) + + + **You can't use your [Admin API key](https://www.algolia.com/doc/guides/security/api-keys/#predefined-api-keys).** appId: $ref: '../parameters.yml#/applicationID' exclusionPatterns: @@ -34,12 +45,14 @@ Configuration: description: | URLs to exclude from crawling. + Uses [micromatch](https://github.com/micromatch/micromatch) for negation, wildcards, and more. externalData: type: array description: | References to external data sources for enriching the extracted records. + For more information, see [Enrich extracted records with external data](https://www.algolia.com/doc/tools/crawler/guides/enriching-extraction-with-external-data/). maxItems: 10 items: @@ -59,22 +72,44 @@ Configuration: ignoreNoFollowTo: type: boolean description: | - Whether to ignore the `nofollow` meta tag or link attribute. + Determines if the crawler should follow links with a `nofollow` directive. + If `true`, the crawler will ignore the `nofollow` directive and crawl links on the page. + + + The crawler always ignores links that don't match your [configuration settings](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#exclude-and-include-content). + + + `ignoreNoFollowTo` applies to: + - For more information, see the [`ignoreNoFollowTo` documentation](https://www.algolia.com/doc/tools/crawler/apis/ignorenofollowto/). + - Links that are ignored because the [`robots` meta tag](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta/name#Other_metadata_names) contains `nofollow` or `none`. + + - Links with a [`rel` attribute](https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/rel) containing the `nofollow` directive. ignoreNoIndex: type: boolean description: | Whether to ignore the `noindex` robots meta tag. If `true` pages with this meta tag _will_ be crawled. + ignorePaginationAttributes: + type: boolean + description: | + Whether the crawler should follow `rel="prev"` and `rel="next"` pagination links in your `` HTML. + + + - If `true`, the crawler ignores the pagination links. + + - If `false`, the crawler follows the pagination links. + default: true ignoreQueryParams: type: array description: | Query parameters to ignore while crawling. + All URLs with the matching query parameters will be treated as identical. This prevents indexing URLs that just differ by their query parameters. + You can use wildcard characters to pattern match. maxItems: 9999 example: @@ -96,7 +131,10 @@ Configuration: description: | Crawler index settings. + These index settings are only applied during the first crawl of an index. + + Any subsequent changes won't be applied to the index. Instead, make changes to your index settings in the [Algolia dashboard](https://dashboard.algolia.com/explorer/configuration/). additionalProperties: @@ -108,7 +146,8 @@ Configuration: description: | Function for extracting URLs from links on crawled pages. - For more information, see the [`linkExtractor` documentation](https://www.algolia.com/doc/tools/crawler/apis/linkextractor/). + + For more information, see the [`linkExtractor` documentation](https://www.algolia.com/doc/tools/crawler/apis/configuration/link-extractor/). properties: __type: $ref: './action.yml#/configurationRecordExtractorType' @@ -126,34 +165,85 @@ Configuration: login: $ref: '#/login' maxDepth: - type: number + type: integer description: | - Maximum path depth of crawled URLs. 
- For example, if `maxDepth` is 2, `https://example.com/foo/bar` is crawled, - but `https://example.com/foo/bar/baz` won't. - Trailing slashes increase the URL depth. + Determines the maximum path depth of crawled URLs. + + + Path depth is calculated based on the number of slash characters (`/`) after the domain (starting at 1). + For example: + + + **1** `http://example.com` + + + **1** `http://example.com/` + + + **1** `http://example.com/foo` + + + **2** `http://example.com/foo/` + + + **2** `http://example.com/foo/bar` + + + **3** `http://example.com/foo/bar/` + + + **URLs added with `startUrls` and `sitemaps` aren't checked for `maxDepth`.** minimum: 1 maximum: 100 + example: 5 maxUrls: - type: number + type: integer description: | Limits the number of URLs your crawler processes. + Change it to a low value, such as 100, for quick crawling tests. + + Change it to a higher explicit value for full crawls to prevent it from getting "lost" in complex site structures. + Because the Crawler works on many pages simultaneously, `maxUrls` doesn't guarantee finding the same pages each time it runs. minimum: 1 maximum: 15000000 + example: 250 rateLimit: - type: number + type: integer description: | - Number of concurrent tasks per second. + Determines the number of concurrent tasks per second that can run for this configuration. + + + A higher rate limit means more crawls per second. + + + Algolia prevents system overload by ensuring the number of URLs added in the last second and the number of URLs being processed is less than the rate limit: + + + ``` + max(new_urls_added, active_urls_processing) <= rateLimit + ``` + - If processing each URL takes _n_ seconds, - your crawler can process `rateLimit / n` URLs per second. + Start with a low value (for example, 2) and increase it if you need faster crawling. + Be aware that a high `rateLimit` can have a huge impact on bandwidth cost and server resource consumption. - Higher numbers mean faster crawls but they also increase your bandwidth and server load. + + The number of pages processed per second depends on the average time it takes to fetch, process, and upload a URL. + For a given `rateLimit` if fetching, processing, and uploading URLs takes (on average): + + + - Less than a second, your crawler processes up to `rateLimit` pages per second. + + - Four seconds, your crawler processes up to `rateLimit / 4` pages per second. + + + In the latter case, increasing `rateLimit` improves performance, up to a point. + However, if the processing time remains at four seconds, increasing `rateLimit` won't increase the number of pages processed per second. minimum: 1 maximum: 100 example: 4 @@ -264,11 +354,11 @@ requestOptions: type: string description: Proxy for all crawler requests. timeout: - type: number + type: integer default: 30000 description: Timeout in milliseconds for the crawl. retries: - type: number + type: integer default: 3 description: Maximum number of retries to crawl one URL. headers: @@ -279,12 +369,12 @@ waitTime: description: Timeout for the HTTP request. properties: min: - type: number + type: integer default: 0 description: Minimum waiting time in milliseconds. example: 7000 max: - type: number + type: integer default: 20000 description: Maximum waiting time in milliseconds. example: 15000 @@ -312,13 +402,64 @@ headers: example: session=1234 login: - description: Authorization method and credentials for crawling protected content. + description: > + Authorization method and credentials for crawling protected content. 
+ + + The Crawler has several authentication methods for accessing protected content: + + + - **Basic authentication.** The Crawler obtains a session cookie from the login page. + + - **OAuth 2.0 authentication** (`oauthRequest`). The Crawler uses OAuth 2.0 client credentials to obtain an access token for authentication. + + + **Basic authentication** + + + The Crawler extracts the `Set-Cookie` response header from the login page, stores that cookie, and sends it in the `Cookie` header when crawling all pages defined in the configuration. + + + This cookie is retrieved only at the start of each full crawl. + If it expires, it isn't automatically renewed. + + + The Crawler can obtain the session cookie in one of two ways: + + + - **HTTP request authentication** (`fetchRequest`). The Crawler sends a direct request with your credentials to the login endpoint, similar to a `curl` command. + + - **Browser-based authentication** (`browserRequest`). The Crawler emulates a web browser by loading the login page, entering the credentials, and submitting the login form as a real user would. + + + **OAuth 2.0** + + + The crawler supports [OAuth 2.0 client credentials grant flow](https://datatracker.ietf.org/doc/html/rfc6749#section-4.4): + + + 1. It performs an access token request with the provided credentials + + 1. Stores the fetched token in an `Authorization` header + + 1. Sends the token when crawling site pages. + + + > This token is only fetched at the beginning of each complete crawl. + If it expires, it isn't automatically renewed. + + + Client authentication passes the credentials (`client_id` and `client_secret`) [in the request body](https://datatracker.ietf.org/doc/html/rfc6749#section-2.3.1). + + + The [Azure AD v1.0](https://learn.microsoft.com/en-us/previous-versions/azure/active-directory/azuread-dev/v1-oauth2-client-creds-grant-flow) provider is supported. oneOf: - $ref: '#/fetchRequest' - $ref: '#/browserRequest' - $ref: '#/oauthRequest' fetchRequest: + title: HTTP request type: object description: Information for making a HTTP request for authorization. properties: @@ -330,8 +471,17 @@ fetchRequest: $ref: '#/loginRequestOptions' required: - url + example: + url: "https://example.com/secure/login-with-post" + requestOptions: + method: "POST" + headers: + Content-Type: "application/x-www-form-urlencoded" + body: "id=my-id&password=my-password" + timeout: 5000 browserRequest: + title: Browser-based type: object description: | Information for using a web browser for authorization. @@ -342,6 +492,7 @@ browserRequest: description: | URL of your login page. + The crawler looks for an input matching the selector `input[type=text]` or `input[type=email]` for the username and `input[type=password]` for the password. example: https://example.com/login username: @@ -358,17 +509,32 @@ browserRequest: - url - username - password + example: + url: "https://example.com/secure/login-page" + username: "my-id" + password: "my-password" oauthRequest: + title: OAuth 2.0 type: object description: | Authorization information for using the [OAuth 2.0 client credentials](https://datatracker.ietf.org/doc/html/rfc6749#section-4.4) authorization grant. + + OAuth authorization is supported for [Azure Active Directory version 1](https://learn.microsoft.com/en-us/previous-versions/azure/active-directory/azuread-dev/v1-oauth2-client-creds-grant-flow) as provider. 
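As an illustration of the client credentials flow described above, the sketch below shows the kind of token request the crawler performs. The endpoint, client ID, and secret are placeholders, and the exact request the Crawler sends may differ.

```js
// Sketch of an OAuth 2.0 client credentials token request (RFC 6749, section 4.4).
// The endpoint and credentials below are placeholders.
const response = await fetch('https://example.com/oauth2/token', {
  method: 'POST',
  headers: { 'Content-Type': 'application/x-www-form-urlencoded' },
  body: new URLSearchParams({
    grant_type: 'client_credentials',
    client_id: 'my-client-id',
    client_secret: 'my-client-secret',
  }),
});
const { access_token } = await response.json();
// The crawler then sends the fetched token with each page request:
// Authorization: Bearer <access_token>
```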
properties: accessTokenRequest: $ref: '#/accessTokenRequest' required: - accessTokenRequest + example: + accessTokenRequest: + url: "https://example.com/oauth2/token" + grant_type: "client_credentials" + client_id: "my-client-id" + client_secret: "my-client-secret" + extraParameters: + resource: "https://protected.example.com/" loginRequestOptions: type: object @@ -386,7 +552,7 @@ loginRequestOptions: description: Form content. example: 'id=user&password=s3cr3t' timeout: - type: number + type: integer description: Timeout for the request. accessTokenRequest: @@ -433,13 +599,14 @@ extraParameters: description: | App ID URI of the receiving web service. - For more information, see [Azure Active Directory](https://learn.microsoft.com/en-us/previous-versions/azure/active-directory/azuread-dev/v1-oauth2-client-creds-grant-flow#first-case-access-token-request-with-a-shared-secret). + For more information, see [Azure Active Directory](https://learn.microsoft.com/en-us/previous-versions/azure/active-directory/azuread-dev/v1-oauth2-client-creds-grant-flow#first-case-access-token-request-with-a-shared-secret). safetyChecks: type: object description: | Checks to ensure the crawl was successful. + For more information, see the [Safety checks](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#safety-checks) documentation. properties: beforeIndexPublishing: @@ -450,13 +617,13 @@ beforeIndexPublishing: description: Checks triggered after the crawl finishes but before the records are added to the Algolia index. properties: maxLostRecordsPercentage: - type: number + type: integer description: Maximum difference in percent between the numbers of records between crawls. minimum: 1 maximum: 100 default: 10 maxFailedUrls: - type: number + type: integer description: Stops the crawler if a specified number of pages fail to crawl. schedule: @@ -464,5 +631,25 @@ schedule: description: | Schedule for running the crawl. - For more information, see the [`schedule` documentation](https://www.algolia.com/doc/tools/crawler/apis/schedule/). + + Instead of manually starting a crawl each time, you can set up a schedule for automatic crawls. + + + [Use the visual UI](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration-visual/) or add the `schedule` parameter to [your configuration](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/). + + + `schedule` uses [Later.js syntax](https://bunkat.github.io/later/) to specify when to crawl your site. + Here are some key things to keep in mind when using `Later.js` syntax with the Crawler: + + - The interval between two scheduled crawls must be at least 24 hours. + + - To crawl daily, use "every 1 day" instead of "everyday" or "every day". + + - If you don't specify a time, the crawl can happen any time during the scheduled day. + + - Specify times for the UTC (GMT+0) timezone + + - Include minutes when specifying a time. For example, "at 3:00 pm" instead of "at 3pm". + + - Use "at 12:00 am" to specify midnight, not "at 00:00 am". example: every weekday at 12:00 pm diff --git a/specs/crawler/spec.yml b/specs/crawler/spec.yml index d6b69e9966..1af804a0bb 100644 --- a/specs/crawler/spec.yml +++ b/specs/crawler/spec.yml @@ -13,14 +13,12 @@ info: **All requests must use HTTPS.** ## Availability and authentication - - Access to the Crawler API is available with the [Crawler add-on](https://www.algolia.com/pricing/). 
- + To authenticate your API requests, use the **basic authentication** header: - `Authorization: Basic ` - where `` is a base64-encoded string `:`. + Where `` is a base64-encoded string `:`. - ``. The Crawler user ID. - ``. The Crawler API key. From 103ffe0dd71675d7e67b17db65e3877e33368017 Mon Sep 17 00:00:00 2001 From: gazconroy Date: Wed, 19 Mar 2025 10:00:10 +0000 Subject: [PATCH 2/9] Example for pathAliases --- specs/crawler/common/schemas/action.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/specs/crawler/common/schemas/action.yml b/specs/crawler/common/schemas/action.yml index f13a34380d..7571815aad 100644 --- a/specs/crawler/common/schemas/action.yml +++ b/specs/crawler/common/schemas/action.yml @@ -32,7 +32,7 @@ Action: It functions similarly to the `pathsToMatch` action but without record extraction. - `discoveryPatterns` uses [micromatch](https://github.com/micromatch/micromatch) to support matching with wildcards, negation, and other features. + Uses [micromatch](https://github.com/micromatch/micromatch) to match wildcards, negation, and other features. The crawler adds all matching URLs to its queue. items: $ref: '#/urlPattern' @@ -217,6 +217,14 @@ pathAliases: The crawl continues from the _transformed_ URLs. + + + For example, if you create a mapping for `{ "dev.example.com": { '/foo': '/bar' } }` and the crawler encounters `https://dev.example.com/foo/hello/`, + it’s transformed to `https://dev.example.com/bar/hello/`. + + + > Compare with the `hostnameAliases` action. + additionalProperties: type: object description: Hostname for which matching paths should be replaced. From 17bbe04c546a5c14c1e8594e4821c04178e14496 Mon Sep 17 00:00:00 2001 From: gazconroy Date: Wed, 19 Mar 2025 10:10:17 +0000 Subject: [PATCH 3/9] Typos --- specs/crawler/common/schemas/configuration.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/specs/crawler/common/schemas/configuration.yml b/specs/crawler/common/schemas/configuration.yml index b95fc81c3f..7c037c5bf6 100644 --- a/specs/crawler/common/schemas/configuration.yml +++ b/specs/crawler/common/schemas/configuration.yml @@ -23,13 +23,14 @@ Configuration: The API key must have: - - These [rights and restrictions](https://www.algolia.com/doc/guides/security/api-keys/#rights-and-restrictions): `search`, `addObject`, `deleteObject`, `deleteIndex`, `settings`, `editSettings`, `listIndexes`, `browse` + - These [rights and restrictions](https://www.algolia.com/doc/guides/security/api-keys/#rights-and-restrictions): + `search`, `addObject`, `deleteObject`, `deleteIndex`, `settings`, `editSettings`, `listIndexes`, `browse`. - Access to the correct set of indices, based on the crawler's `indexPrefix`. - For example, if the prefix is `crawler_`, the API key must have access to `crawler_*`) + For example, if the prefix is `crawler_`, the API key must have access to `crawler_*`. - **You can't use your [Admin API key](https://www.algolia.com/doc/guides/security/api-keys/#predefined-api-keys).** + **Don't use your [Admin API key](https://www.algolia.com/doc/guides/security/api-keys/#predefined-api-keys).** appId: $ref: '../parameters.yml#/applicationID' exclusionPatterns: @@ -287,9 +288,9 @@ ignoreCanonicalTo: oneOf: - type: boolean description: | - Determines if the crawler should extract records from a page with a [canonical URL](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#canonical-urls-and-crawler-behaviorr). 
+ Determines if the crawler should extract records from a page with a [canonical URL](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#canonical-urls-and-crawler-behavior). - If ignoreCanonicalTo is set to: + If `ignoreCanonicalTo` is set to: - `true` all canonical URLs are ignored. - One or more URL patterns, the crawler will ignore the canonical URL if it matches a pattern. From c20759321024fe75098c7005fc48667048a141c2 Mon Sep 17 00:00:00 2001 From: Kai Welke Date: Wed, 19 Mar 2025 11:10:01 +0100 Subject: [PATCH 4/9] fix: whitespace --- specs/crawler/common/schemas/action.yml | 74 +-------- .../crawler/common/schemas/configuration.yml | 140 +++++------------- 2 files changed, 43 insertions(+), 171 deletions(-) diff --git a/specs/crawler/common/schemas/action.yml b/specs/crawler/common/schemas/action.yml index 7571815aad..91f66ba041 100644 --- a/specs/crawler/common/schemas/action.yml +++ b/specs/crawler/common/schemas/action.yml @@ -3,14 +3,10 @@ Action: description: | How to process crawled URLs. - Each action defines: - - The targeted subset of URLs it processes. - - What information to extract from the web pages. - - The Algolia indices where the extracted records will be stored. If a single web page matches several actions, @@ -26,14 +22,12 @@ Action: type: array description: | Which _intermediary_ web pages the crawler should visit. - Use `discoveryPatterns` to define pages that should be visited _just_ for their links to other pages, _not_ their content. - - + Use `discoveryPatterns` to define pages that should be visited _just_ for their links to other pages, + _not_ their content. It functions similarly to the `pathsToMatch` action but without record extraction. - - Uses [micromatch](https://github.com/micromatch/micromatch) to match wildcards, negation, and other features. - The crawler adds all matching URLs to its queue. + `discoveryPatterns` uses [micromatch](https://github.com/micromatch/micromatch) to support matching with wildcards, + negation, and other features. items: $ref: '#/urlPattern' fileTypesToMatch: @@ -41,7 +35,6 @@ Action: description: | File types for crawling non-HTML documents. - For more information, see [Extract data from non-HTML documents](https://www.algolia.com/doc/tools/crawler/extracting-data/non-html-documents/). maxItems: 100 items: @@ -70,7 +63,6 @@ Action: description: | URLs to which this action should apply. - Uses [micromatch](https://github.com/micromatch/micromatch) for negation, wildcards, and more. minItems: 1 maxItems: 100 @@ -82,11 +74,8 @@ Action: description: | Function for extracting information from a crawled page and transforming it into Algolia records for indexing. - The Crawler has an [editor](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#the-editor) with autocomplete and validation to help you update the `recordExtractor`. - - - For details, consult the [`recordExtractor` documentation](https://www.algolia.com/doc/tools/crawler/apis/configuration/actions/#parameter-param-recordextractor). + For details, see the [`recordExtractor` documentation](https://www.algolia.com/doc/tools/crawler/apis/configuration/actions/#parameter-param-recordextractor). properties: __type: $ref: '#/configurationRecordExtractorType' @@ -127,7 +116,6 @@ fileTypes: description: | Supported file types for indexing non-HTML documents. - For more information, see [Extract data from non-HTML documents](https://www.algolia.com/doc/tools/crawler/extracting-data/non-html-documents/). 
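As a sketch of how these file types are typically used, an action might restrict non-HTML crawling as shown below (shown as a JavaScript configuration fragment); the URLs and extractor body are placeholders, not values from this spec.

```js
// Hypothetical action fragment; URLs and extractor logic are placeholders.
const action = {
  pathsToMatch: ['https://example.com/reports/**'],
  // Only crawl non-HTML documents of these types (see the enum below).
  fileTypesToMatch: ['pdf', 'doc'],
  recordExtractor: ({ url }) => [{ url: url.href }],
};
```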
enum: - doc @@ -145,7 +133,6 @@ urlPattern: description: | Pattern for matching URLs. - Uses [micromatch](https://github.com/micromatch/micromatch) for negation, wildcards, and more. example: https://www.algolia.com/** @@ -153,47 +140,7 @@ hostnameAliases: type: object example: 'dev.example.com': 'example.com' - description: | - Key-value pairs to replace matching hostnames found in a sitemap, - on a page, in canonical links, or redirects. - - - During a crawl, this action maps one hostname to another whenever the crawler encounters specific URLs. - This helps with links to staging environments (like `dev.example.com`) or external hosting services (such as YouTube). - - - For example, with this `hostnameAliases` mapping: - - { - hostnameAliases: { - 'dev.example.com': 'example.com' - } - } - - 1. The crawler encounters `https://dev.example.com/solutions/voice-search/`. - - 1. `hostnameAliases` transforms the URL to `https://example.com/solutions/voice-search/`. - - 1. The crawler follows the transformed URL (not the original). - - - **`hostnameAliases` only changes URLs, not page text. In the preceding example, if the extracted text contains the string `dev.example.com`, it remains unchanged.** - - - The crawler can discover URLs in places such as: - - - - Crawled pages - - - Sitemaps - - - [Canonical URLs](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#canonical-urls-and-crawler-behavior) - - - Redirects. - - - However, `hostnameAliases` doesn't transform URLs you explicitly set in the `startUrls` or `sitemaps` parameters, - nor does it affect the `pathsToMatch` action or other configuration elements. + description: "Key-value pairs to replace matching hostnames found in a sitemap,\non a page, in canonical links, or redirects.\n\n\nDuring a crawl, this action maps one hostname to another whenever the crawler encounters specific URLs.\nThis helps with links to staging environments (like `dev.example.com`) or external hosting services (such as YouTube).\n\n\nFor example, with this `hostnameAliases` mapping:\n\n {\n hostnameAliases: {\n 'dev.example.com': 'example.com'\n }\n }\n\n1. The crawler encounters `https://dev.example.com/solutions/voice-search/`.\n\n1. `hostnameAliases` transforms the URL to `https://example.com/solutions/voice-search/`.\n\n1. The crawler follows the transformed URL (not the original).\n\n\n**`hostnameAliases` only changes URLs, not page text. In the preceding example, if the extracted text contains the string `dev.example.com`, it remains unchanged.**\n\n\nThe crawler can discover URLs in places such as:\n\n\n- Crawled pages\n\n- Sitemaps\n\n- [Canonical URLs](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#canonical-urls-and-crawler-behavior)\n\n- Redirects. \n\n\nHowever, `hostnameAliases` doesn't transform URLs you explicitly set in the `startUrls` or `sitemaps` parameters,\nnor does it affect the `pathsToMatch` action or other configuration elements.\n" additionalProperties: type: string description: Hostname that should be added in the records. @@ -207,15 +154,11 @@ pathAliases: description: | Key-value pairs to replace matching paths with new values. - It doesn't replace: - - URLs in the `startUrls`, `sitemaps`, `pathsToMatch`, and other settings. - - Paths found in extracted text. - The crawl continues from the _transformed_ URLs. @@ -237,10 +180,9 @@ pathAliases: cache: type: object description: | - Whether the crawler should cache crawled pages. 
- + Whether the crawler should cache crawled pages. - For more information, see [Partial crawls with caching](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#partial-crawls-with-caching). + For more information, see [Partial crawls with caching](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#partial-crawls-with-caching). properties: enabled: type: boolean diff --git a/specs/crawler/common/schemas/configuration.yml b/specs/crawler/common/schemas/configuration.yml index 7c037c5bf6..e4c978a546 100644 --- a/specs/crawler/common/schemas/configuration.yml +++ b/specs/crawler/common/schemas/configuration.yml @@ -19,18 +19,13 @@ Configuration: The Algolia API key the crawler uses for indexing records. If you don't provide an API key, one will be generated by the Crawler when you create a configuration. - The API key must have: - - - These [rights and restrictions](https://www.algolia.com/doc/guides/security/api-keys/#rights-and-restrictions): - `search`, `addObject`, `deleteObject`, `deleteIndex`, `settings`, `editSettings`, `listIndexes`, `browse`. - + - These [rights and restrictions](https://www.algolia.com/doc/guides/security/api-keys/#rights-and-restrictions): `search`, `addObject`, `deleteObject`, `deleteIndex`, `settings`, `editSettings`, `listIndexes`, `browse` - Access to the correct set of indices, based on the crawler's `indexPrefix`. For example, if the prefix is `crawler_`, the API key must have access to `crawler_*`. - - **Don't use your [Admin API key](https://www.algolia.com/doc/guides/security/api-keys/#predefined-api-keys).** + **Don't use your [Admin API key](https://www.algolia.com/doc/guides/security/api-keys/#predefined-api-keys)**. appId: $ref: '../parameters.yml#/applicationID' exclusionPatterns: @@ -46,14 +41,12 @@ Configuration: description: | URLs to exclude from crawling. - Uses [micromatch](https://github.com/micromatch/micromatch) for negation, wildcards, and more. externalData: type: array description: | References to external data sources for enriching the extracted records. - For more information, see [Enrich extracted records with external data](https://www.algolia.com/doc/tools/crawler/guides/enriching-extraction-with-external-data/). maxItems: 10 items: @@ -76,15 +69,10 @@ Configuration: Determines if the crawler should follow links with a `nofollow` directive. If `true`, the crawler will ignore the `nofollow` directive and crawl links on the page. - The crawler always ignores links that don't match your [configuration settings](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#exclude-and-include-content). - - `ignoreNoFollowTo` applies to: - - Links that are ignored because the [`robots` meta tag](https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta/name#Other_metadata_names) contains `nofollow` or `none`. - - Links with a [`rel` attribute](https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/rel) containing the `nofollow` directive. ignoreNoIndex: type: boolean @@ -96,9 +84,7 @@ Configuration: description: | Whether the crawler should follow `rel="prev"` and `rel="next"` pagination links in your `` HTML. - - If `true`, the crawler ignores the pagination links. - - If `false`, the crawler follows the pagination links. default: true ignoreQueryParams: @@ -106,11 +92,9 @@ Configuration: description: | Query parameters to ignore while crawling. - All URLs with the matching query parameters will be treated as identical. 
This prevents indexing URLs that just differ by their query parameters. - You can use wildcard characters to pattern match. maxItems: 9999 example: @@ -132,10 +116,8 @@ Configuration: description: | Crawler index settings. - These index settings are only applied during the first crawl of an index. - Any subsequent changes won't be applied to the index. Instead, make changes to your index settings in the [Algolia dashboard](https://dashboard.algolia.com/explorer/configuration/). additionalProperties: @@ -147,7 +129,6 @@ Configuration: description: | Function for extracting URLs from links on crawled pages. - For more information, see the [`linkExtractor` documentation](https://www.algolia.com/doc/tools/crawler/apis/configuration/link-extractor/). properties: __type: @@ -170,30 +151,17 @@ Configuration: description: | Determines the maximum path depth of crawled URLs. - Path depth is calculated based on the number of slash characters (`/`) after the domain (starting at 1). For example: + - **1** `http://example.com` + - **1** `http://example.com/` + - **1** `http://example.com/foo` + - **2** `http://example.com/foo/` + - **2** `http://example.com/foo/bar` + - **3** `http://example.com/foo/bar/` - **1** `http://example.com` - - - **1** `http://example.com/` - - - **1** `http://example.com/foo` - - - **2** `http://example.com/foo/` - - - **2** `http://example.com/foo/bar` - - - **3** `http://example.com/foo/bar/` - - - **URLs added with `startUrls` and `sitemaps` aren't checked for `maxDepth`.** + **URLs added with `startUrls` and `sitemaps` aren't checked for `maxDepth`.**. minimum: 1 maximum: 100 example: 5 @@ -202,13 +170,8 @@ Configuration: description: | Limits the number of URLs your crawler processes. - Change it to a low value, such as 100, for quick crawling tests. - - Change it to a higher explicit value for full crawls to prevent it from getting "lost" in complex site structures. - - Because the Crawler works on many pages simultaneously, `maxUrls` doesn't guarantee finding the same pages each time it runs. minimum: 1 maximum: 15000000 @@ -218,10 +181,7 @@ Configuration: description: | Determines the number of concurrent tasks per second that can run for this configuration. - A higher rate limit means more crawls per second. - - Algolia prevents system overload by ensuring the number of URLs added in the last second and the number of URLs being processed is less than the rate limit: @@ -229,20 +189,15 @@ Configuration: max(new_urls_added, active_urls_processing) <= rateLimit ``` - Start with a low value (for example, 2) and increase it if you need faster crawling. Be aware that a high `rateLimit` can have a huge impact on bandwidth cost and server resource consumption. - The number of pages processed per second depends on the average time it takes to fetch, process, and upload a URL. For a given `rateLimit` if fetching, processing, and uploading URLs takes (on average): - - Less than a second, your crawler processes up to `rateLimit` pages per second. - - Four seconds, your crawler processes up to `rateLimit / 4` pages per second. - In the latter case, increasing `rateLimit` improves performance, up to a point. However, if the processing time remains at four seconds, increasing `rateLimit` won't increase the number of pages processed per second. minimum: 1 @@ -403,56 +358,44 @@ headers: example: session=1234 login: - description: > + description: | Authorization method and credentials for crawling protected content. 
+ The Crawler supports these authentication methods: - The Crawler has several authentication methods for accessing protected content: - - - - **Basic authentication.** The Crawler obtains a session cookie from the login page. - - - **OAuth 2.0 authentication** (`oauthRequest`). The Crawler uses OAuth 2.0 client credentials to obtain an access token for authentication. - + - **Basic authentication**. + The Crawler obtains a session cookie from the login page. + - **OAuth 2.0 authentication** (`oauthRequest`). + The Crawler uses OAuth 2.0 client credentials to obtain an access token for authentication. **Basic authentication** - - The Crawler extracts the `Set-Cookie` response header from the login page, stores that cookie, and sends it in the `Cookie` header when crawling all pages defined in the configuration. - + The Crawler extracts the `Set-Cookie` response header from the login page, stores that cookie, + and sends it in the `Cookie` header when crawling all pages defined in the configuration. This cookie is retrieved only at the start of each full crawl. If it expires, it isn't automatically renewed. - The Crawler can obtain the session cookie in one of two ways: - - - **HTTP request authentication** (`fetchRequest`). The Crawler sends a direct request with your credentials to the login endpoint, similar to a `curl` command. - - - **Browser-based authentication** (`browserRequest`). The Crawler emulates a web browser by loading the login page, entering the credentials, and submitting the login form as a real user would. - + - **HTTP request authentication** (`fetchRequest`). + The Crawler sends a direct request with your credentials to the login endpoint, similar to a `curl` command. + - **Browser-based authentication** (`browserRequest`). + The Crawler emulates a web browser by loading the login page, entering the credentials, + and submitting the login form as a real user would. **OAuth 2.0** - The crawler supports [OAuth 2.0 client credentials grant flow](https://datatracker.ietf.org/doc/html/rfc6749#section-4.4): - 1. It performs an access token request with the provided credentials - 1. Stores the fetched token in an `Authorization` header - 1. Sends the token when crawling site pages. - - > This token is only fetched at the beginning of each complete crawl. + This token is only fetched at the beginning of each complete crawl. If it expires, it isn't automatically renewed. - Client authentication passes the credentials (`client_id` and `client_secret`) [in the request body](https://datatracker.ietf.org/doc/html/rfc6749#section-2.3.1). - - The [Azure AD v1.0](https://learn.microsoft.com/en-us/previous-versions/azure/active-directory/azuread-dev/v1-oauth2-client-creds-grant-flow) provider is supported. oneOf: - $ref: '#/fetchRequest' @@ -473,12 +416,12 @@ fetchRequest: required: - url example: - url: "https://example.com/secure/login-with-post" + url: 'https://example.com/secure/login-with-post' requestOptions: - method: "POST" + method: 'POST' headers: - Content-Type: "application/x-www-form-urlencoded" - body: "id=my-id&password=my-password" + Content-Type: 'application/x-www-form-urlencoded' + body: 'id=my-id&password=my-password' timeout: 5000 browserRequest: @@ -493,7 +436,6 @@ browserRequest: description: | URL of your login page. - The crawler looks for an input matching the selector `input[type=text]` or `input[type=email]` for the username and `input[type=password]` for the password. 
example: https://example.com/login username: @@ -511,9 +453,9 @@ browserRequest: - username - password example: - url: "https://example.com/secure/login-page" - username: "my-id" - password: "my-password" + url: 'https://example.com/secure/login-page' + username: 'my-id' + password: 'my-password' oauthRequest: title: OAuth 2.0 @@ -521,7 +463,6 @@ oauthRequest: description: | Authorization information for using the [OAuth 2.0 client credentials](https://datatracker.ietf.org/doc/html/rfc6749#section-4.4) authorization grant. - OAuth authorization is supported for [Azure Active Directory version 1](https://learn.microsoft.com/en-us/previous-versions/azure/active-directory/azuread-dev/v1-oauth2-client-creds-grant-flow) as provider. properties: accessTokenRequest: @@ -530,12 +471,12 @@ oauthRequest: - accessTokenRequest example: accessTokenRequest: - url: "https://example.com/oauth2/token" - grant_type: "client_credentials" - client_id: "my-client-id" - client_secret: "my-client-secret" + url: 'https://example.com/oauth2/token' + grant_type: 'client_credentials' + client_id: 'my-client-id' + client_secret: 'my-client-secret' extraParameters: - resource: "https://protected.example.com/" + resource: 'https://protected.example.com/' loginRequestOptions: type: object @@ -600,14 +541,12 @@ extraParameters: description: | App ID URI of the receiving web service. - For more information, see [Azure Active Directory](https://learn.microsoft.com/en-us/previous-versions/azure/active-directory/azuread-dev/v1-oauth2-client-creds-grant-flow#first-case-access-token-request-with-a-shared-secret). safetyChecks: type: object description: | Checks to ensure the crawl was successful. - For more information, see the [Safety checks](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#safety-checks) documentation. properties: beforeIndexPublishing: @@ -632,25 +571,16 @@ schedule: description: | Schedule for running the crawl. - Instead of manually starting a crawl each time, you can set up a schedule for automatic crawls. - - [Use the visual UI](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration-visual/) or add the `schedule` parameter to [your configuration](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/). - `schedule` uses [Later.js syntax](https://bunkat.github.io/later/) to specify when to crawl your site. Here are some key things to keep in mind when using `Later.js` syntax with the Crawler: - The interval between two scheduled crawls must be at least 24 hours. - - To crawl daily, use "every 1 day" instead of "everyday" or "every day". - - If you don't specify a time, the crawl can happen any time during the scheduled day. - - Specify times for the UTC (GMT+0) timezone - - Include minutes when specifying a time. For example, "at 3:00 pm" instead of "at 3pm". - - Use "at 12:00 am" to specify midnight, not "at 00:00 am". 
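As a sketch of how these rules come together, a configuration might set `schedule` as shown below (a JavaScript configuration fragment); the application ID, API key, and URL are placeholders.

```js
// Hypothetical configuration fragment; credentials and URLs are placeholders.
const config = {
  appId: 'YOUR_APP_ID',
  apiKey: 'YOUR_CRAWLER_API_KEY',
  startUrls: ['https://example.com'],
  // Later.js syntax: intervals of at least 24 hours, UTC times, minutes included.
  schedule: 'every 1 day at 3:00 pm',
};
```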
example: every weekday at 12:00 pm From 5aab7435f54582742eab2567dff28773ddc08dd1 Mon Sep 17 00:00:00 2001 From: gazconroy Date: Wed, 19 Mar 2025 10:15:03 +0000 Subject: [PATCH 5/9] Remove "add-on" statement --- specs/crawler/common/parameters.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/specs/crawler/common/parameters.yml b/specs/crawler/common/parameters.yml index 4bf171bbdc..9b43886129 100644 --- a/specs/crawler/common/parameters.yml +++ b/specs/crawler/common/parameters.yml @@ -54,7 +54,6 @@ applicationID: type: string description: | Algolia application ID where the crawler creates and updates indices. - The Crawler add-on must be enabled for this application. CrawlerID: type: string From d814102f9587770c9cb206d0caa361d297a65de0 Mon Sep 17 00:00:00 2001 From: gazconroy Date: Wed, 19 Mar 2025 11:37:03 +0000 Subject: [PATCH 6/9] Clarifications --- specs/crawler/common/schemas/configuration.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/specs/crawler/common/schemas/configuration.yml b/specs/crawler/common/schemas/configuration.yml index e4c978a546..a5cb1bb760 100644 --- a/specs/crawler/common/schemas/configuration.yml +++ b/specs/crawler/common/schemas/configuration.yml @@ -78,11 +78,11 @@ Configuration: type: boolean description: | Whether to ignore the `noindex` robots meta tag. - If `true` pages with this meta tag _will_ be crawled. + If `true`, pages with this meta tag _will_ be crawled. ignorePaginationAttributes: type: boolean description: | - Whether the crawler should follow `rel="prev"` and `rel="next"` pagination links in your `` HTML. + Whether the crawler should follow `rel="prev"` and `rel="next"` pagination links in the `` section of an HTML page. - If `true`, the crawler ignores the pagination links. - If `false`, the crawler follows the pagination links. @@ -92,10 +92,10 @@ Configuration: description: | Query parameters to ignore while crawling. - All URLs with the matching query parameters will be treated as identical. + All URLs with the matching query parameters are treated as identical. This prevents indexing URLs that just differ by their query parameters. - You can use wildcard characters to pattern match. + Use wildcards to match multiple query parameters. maxItems: 9999 example: - ref From 797677f0d3d68c63481c35b06809ba1bd411b8e0 Mon Sep 17 00:00:00 2001 From: gazconroy Date: Wed, 19 Mar 2025 11:58:30 +0000 Subject: [PATCH 7/9] Remove duplicated text from actions --- specs/crawler/common/schemas/action.yml | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/specs/crawler/common/schemas/action.yml b/specs/crawler/common/schemas/action.yml index 91f66ba041..e0b9583323 100644 --- a/specs/crawler/common/schemas/action.yml +++ b/specs/crawler/common/schemas/action.yml @@ -25,17 +25,12 @@ Action: Use `discoveryPatterns` to define pages that should be visited _just_ for their links to other pages, _not_ their content. It functions similarly to the `pathsToMatch` action but without record extraction. - - `discoveryPatterns` uses [micromatch](https://github.com/micromatch/micromatch) to support matching with wildcards, - negation, and other features. items: $ref: '#/urlPattern' fileTypesToMatch: type: array description: | File types for crawling non-HTML documents. - - For more information, see [Extract data from non-HTML documents](https://www.algolia.com/doc/tools/crawler/extracting-data/non-html-documents/). 
maxItems: 100 items: $ref: '#/fileTypes' @@ -91,7 +86,8 @@ Action: maxItems: 100 items: type: string - description: DOM selector. Negation is supported. This lets you ignore pages that match the selector. + description: | + Prefix a selector with `!` to ignore matching pages. example: - .products - '!.featured' @@ -114,8 +110,6 @@ ActionSchedule: fileTypes: type: string description: | - Supported file types for indexing non-HTML documents. - For more information, see [Extract data from non-HTML documents](https://www.algolia.com/doc/tools/crawler/extracting-data/non-html-documents/). enum: - doc @@ -131,9 +125,7 @@ fileTypes: urlPattern: type: string description: | - Pattern for matching URLs. - - Uses [micromatch](https://github.com/micromatch/micromatch) for negation, wildcards, and more. + Use [micromatch](https://github.com/micromatch/micromatch) for negation, wildcards, and more. example: https://www.algolia.com/** hostnameAliases: From 6ccfffc1511e053875c64ba5fc6e996ea579a100 Mon Sep 17 00:00:00 2001 From: gazconroy Date: Wed, 19 Mar 2025 12:05:58 +0000 Subject: [PATCH 8/9] Removed duplicate text from configuration parameters --- specs/crawler/common/schemas/configuration.yml | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/specs/crawler/common/schemas/configuration.yml b/specs/crawler/common/schemas/configuration.yml index a5cb1bb760..2380a7fc6d 100644 --- a/specs/crawler/common/schemas/configuration.yml +++ b/specs/crawler/common/schemas/configuration.yml @@ -39,19 +39,15 @@ Configuration: items: type: string description: | - URLs to exclude from crawling. - - Uses [micromatch](https://github.com/micromatch/micromatch) for negation, wildcards, and more. + Use [micromatch](https://github.com/micromatch/micromatch) for negation, wildcards, and more. externalData: type: array description: | References to external data sources for enriching the extracted records. - - For more information, see [Enrich extracted records with external data](https://www.algolia.com/doc/tools/crawler/guides/enriching-extraction-with-external-data/). maxItems: 10 items: type: string - description: Reference to an external data source you configured in the Crawler dashboard. + description: For more information, see [Enrich extracted records with external data](https://www.algolia.com/doc/tools/crawler/guides/enriching-extraction-with-external-data/). example: testCSV extraUrls: type: array @@ -94,15 +90,13 @@ Configuration: All URLs with the matching query parameters are treated as identical. This prevents indexing URLs that just differ by their query parameters. - - Use wildcards to match multiple query parameters. maxItems: 9999 example: - ref - utm_* items: type: string - description: Query parameter to ignore. You can include wildcards to match a range of similar query parameters. + description: Use wildcards to match multiple query parameters. ignoreRobotsTxtRules: type: boolean description: Whether to ignore rules defined in your `robots.txt` file. 
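To tie these parameters together, a configuration fragment might combine micromatch patterns and query-parameter wildcards as sketched below; the URLs and parameter names are illustrative placeholders.

```js
// Hypothetical configuration fragment; URLs and parameter names are placeholders.
const config = {
  // micromatch patterns support wildcards and negation.
  exclusionPatterns: [
    'https://example.com/internal/**',
    '!https://example.com/internal/docs/**',
  ],
  // Wildcards match families of query parameters, such as all UTM tags.
  ignoreQueryParams: ['ref', 'utm_*'],
  ignoreRobotsTxtRules: false,
};
```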
From 44b852a9d8eb418d51022289c1bb40ab47c1746c Mon Sep 17 00:00:00 2001
From: gazconroy
Date: Wed, 19 Mar 2025 13:33:28 +0000
Subject: [PATCH 9/9] Added schedule action

---
 specs/crawler/common/schemas/action.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/specs/crawler/common/schemas/action.yml b/specs/crawler/common/schemas/action.yml
index e0b9583323..00206d3924 100644
--- a/specs/crawler/common/schemas/action.yml
+++ b/specs/crawler/common/schemas/action.yml
@@ -78,6 +78,12 @@ Action:
           type: string
           description: |
             A JavaScript function (as a string) that returns one or more Algolia records for each crawled page.
+    schedule:
+      type: string
+      description: |
+        How often to perform a complete crawl for this action.
+
+        For more information, consult the [`schedule` parameter documentation](https://www.algolia.com/doc/tools/crawler/apis/configuration/schedule/).
     selectorsToMatch:
       type: array
       description: |
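Taken together, the parameters introduced in this patch series might appear in a configuration as sketched below. This is an illustration only: the `actions` and `indexName` fields come from the broader crawler configuration and aren't part of this excerpt, and the index name, URLs, and extractor logic are placeholders.

```js
// Sketch combining ignorePaginationAttributes with a per-action schedule.
// Index name, URLs, and extractor logic are placeholders.
const config = {
  // Ignore rel="prev" and rel="next" links in each page's <head>.
  ignorePaginationAttributes: true,
  actions: [
    {
      indexName: 'crawler_docs',
      pathsToMatch: ['https://example.com/docs/**'],
      // Per-action schedule in Later.js syntax.
      schedule: 'every 1 day at 3:00 pm',
      recordExtractor: ({ url, $ }) => [
        { objectID: url.href, title: $('title').text() },
      ],
    },
  ],
};
```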