Commit 99d2c47

Return 429 instead of 503 when all the shards of an index are rate limited (#5651)
1 parent 43fc2e4 commit 99d2c47

4 files changed, +172 -33 lines changed

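Before this change, a subrequest whose open shards were all rate limited was reported as "no shards available", which surfaces to the client as a 503. This commit threads a distinct rate-limited signal from the routing table up to the ingest response so that clients receive a 429 and back off instead. The snippet below is a minimal, self-contained sketch of that distinction, not Quickwit code: the enum mirrors the NextOpenShardError type added in this commit, while the HTTP mapping is an assumption based on the commit title (the status-code translation itself happens outside this diff).

// Standalone sketch; the status-code mapping is assumed from the commit title,
// not taken from the Quickwit REST layer.
#[derive(Debug, PartialEq, Eq)]
enum NextOpenShardError {
    // No open shard exists for the index/source at all.
    NoShardsAvailable,
    // Open shards exist, but every one of them is currently rate limited.
    RateLimited,
}

// Hypothetical mapping from the routing error to an HTTP status code.
fn http_status_for(error: &NextOpenShardError) -> u16 {
    match error {
        // The client should back off and retry: 429 Too Many Requests.
        NextOpenShardError::RateLimited => 429,
        // The service genuinely cannot route the request: 503 Service Unavailable.
        NextOpenShardError::NoShardsAvailable => 503,
    }
}

fn main() {
    assert_eq!(http_status_for(&NextOpenShardError::RateLimited), 429);
    assert_eq!(http_status_for(&NextOpenShardError::NoShardsAvailable), 503);
}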

quickwit/quickwit-ingest/src/ingest_v2/router.rs

+112 -21
@@ -49,7 +49,7 @@ use super::debouncing::{
 };
 use super::ingester::PERSIST_REQUEST_TIMEOUT;
 use super::metrics::IngestResultMetrics;
-use super::routing_table::RoutingTable;
+use super::routing_table::{NextOpenShardError, RoutingTable};
 use super::workbench::IngestWorkbench;
 use super::{pending_subrequests, IngesterPool};
 use crate::{get_ingest_router_buffer_size, LeaderId};
@@ -287,6 +287,7 @@ impl IngestRouter {
         }
         for persist_failure in persist_response.failures {
             workbench.record_persist_failure(&persist_failure);
+
             match persist_failure.reason() {
                 PersistFailureReason::ShardClosed => {
                     let shard_id = persist_failure.shard_id().clone();
@@ -314,7 +315,7 @@ impl IngestRouter {
                     // That way we will avoid to retry the persist request on the very
                     // same node.
                     let shard_id = persist_failure.shard_id().clone();
-                    workbench.rate_limited_shard.insert(shard_id);
+                    workbench.rate_limited_shards.insert(shard_id);
                 }
                 _ => {}
             }
@@ -363,39 +364,44 @@ impl IngestRouter {
         self.populate_routing_table_debounced(workbench, debounced_request)
             .await;
 
-        // List of subrequest IDs for which no shards are available to route the subrequests to.
-        let mut no_shards_available_subrequest_ids = Vec::new();
+        // Subrequests for which no shards are available to route the subrequests to.
+        let mut no_shards_available_subrequest_ids: Vec<SubrequestId> = Vec::new();
+        // Subrequests for which the shards are rate limited.
+        let mut rate_limited_subrequest_ids: Vec<SubrequestId> = Vec::new();
 
         let mut per_leader_persist_subrequests: HashMap<&LeaderId, Vec<PersistSubrequest>> =
             HashMap::new();
 
+        let rate_limited_shards: &HashSet<ShardId> = &workbench.rate_limited_shards;
         let state_guard = self.state.lock().await;
 
-        // TODO: Here would be the most optimal place to split the body of the HTTP request into
-        // lines, validate, transform and then pack the docs into compressed batches routed
-        // to the right shards.
-
-        let rate_limited_shards: &HashSet<ShardId> = &workbench.rate_limited_shard;
         for subrequest in pending_subrequests(&workbench.subworkbenches) {
-            let Some(shard) = state_guard
+            let next_open_shard_res_opt = state_guard
                 .routing_table
                 .find_entry(&subrequest.index_id, &subrequest.source_id)
-                .and_then(|entry| {
+                .map(|entry| {
                     entry.next_open_shard_round_robin(&self.ingester_pool, rate_limited_shards)
-                })
-            else {
-                no_shards_available_subrequest_ids.push(subrequest.subrequest_id);
-                continue;
+                });
+            let next_open_shard = match next_open_shard_res_opt {
+                Some(Ok(next_open_shard)) => next_open_shard,
+                Some(Err(NextOpenShardError::RateLimited)) => {
+                    rate_limited_subrequest_ids.push(subrequest.subrequest_id);
+                    continue;
+                }
+                Some(Err(NextOpenShardError::NoShardsAvailable)) | None => {
+                    no_shards_available_subrequest_ids.push(subrequest.subrequest_id);
+                    continue;
+                }
             };
             let persist_subrequest = PersistSubrequest {
                 subrequest_id: subrequest.subrequest_id,
-                index_uid: shard.index_uid.clone().into(),
-                source_id: shard.source_id.clone(),
-                shard_id: Some(shard.shard_id.clone()),
+                index_uid: next_open_shard.index_uid.clone().into(),
+                source_id: next_open_shard.source_id.clone(),
+                shard_id: Some(next_open_shard.shard_id.clone()),
                 doc_batch: subrequest.doc_batch.clone(),
             };
             per_leader_persist_subrequests
-                .entry(&shard.leader_id)
+                .entry(&next_open_shard.leader_id)
                 .or_default()
                 .push(persist_subrequest);
         }
@@ -421,6 +427,7 @@ impl IngestRouter {
             commit_type: commit_type as i32,
         };
         workbench.record_persist_request(&persist_request);
+
         let persist_future = async move {
             let persist_result = tokio::time::timeout(
                 PERSIST_REQUEST_TIMEOUT,
@@ -443,6 +450,9 @@ impl IngestRouter {
         for subrequest_id in no_shards_available_subrequest_ids {
             workbench.record_no_shards_available(subrequest_id);
         }
+        for subrequest_id in rate_limited_subrequest_ids {
+            workbench.record_rate_limited(subrequest_id);
+        }
         self.process_persist_results(workbench, persist_futures)
             .await;
     }
@@ -610,7 +620,6 @@ impl IngestRouterService for IngestRouter {
                 .retry_batch_persist(ingest_request, MAX_PERSIST_ATTEMPTS)
                 .await)
         };
-
         update_ingest_metrics(&ingest_res, num_subrequests);
 
         ingest_res
@@ -1916,7 +1925,7 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn test_do_not_retry_rate_limited_shards() {
+    async fn test_router_does_not_retry_rate_limited_shards() {
         // We avoid retrying a shard limited shard at the scale of a workbench.
         let self_node_id = "test-router".into();
         let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new());
@@ -2075,4 +2084,86 @@ mod tests {
         };
         router.ingest(ingest_request).await.unwrap();
     }
+
+    #[tokio::test]
+    async fn test_router_returns_rate_limited_failure() {
+        let self_node_id = "test-router".into();
+        let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new());
+        let ingester_pool = IngesterPool::default();
+        let replication_factor = 1;
+        let router = IngestRouter::new(
+            self_node_id,
+            control_plane,
+            ingester_pool.clone(),
+            replication_factor,
+            EventBroker::default(),
+        );
+        let mut state_guard = router.state.lock().await;
+        let index_uid: IndexUid = IndexUid::for_test("test-index-0", 0);
+
+        state_guard.routing_table.replace_shards(
+            index_uid.clone(),
+            "test-source",
+            vec![Shard {
+                index_uid: Some(index_uid.clone()),
+                source_id: "test-source".to_string(),
+                shard_id: Some(ShardId::from(1)),
+                shard_state: ShardState::Open as i32,
+                leader_id: "test-ingester-0".to_string(),
+                ..Default::default()
+            }],
+        );
+        drop(state_guard);
+
+        let mut mock_ingester_0 = MockIngesterService::new();
+        mock_ingester_0
+            .expect_persist()
+            .times(1)
+            .returning(move |request| {
+                assert_eq!(request.leader_id, "test-ingester-0");
+                assert_eq!(request.commit_type(), CommitTypeV2::Auto);
+                assert_eq!(request.subrequests.len(), 1);
+                let subrequest = &request.subrequests[0];
+                assert_eq!(subrequest.subrequest_id, 0);
+                let index_uid = subrequest.index_uid().clone();
+                assert_eq!(subrequest.source_id, "test-source");
+                assert_eq!(subrequest.shard_id(), ShardId::from(1));
+                assert_eq!(
+                    subrequest.doc_batch,
+                    Some(DocBatchV2::for_test(["test-doc-foo"]))
+                );
+
+                let response = PersistResponse {
+                    leader_id: request.leader_id,
+                    successes: Vec::new(),
+                    failures: vec![PersistFailure {
+                        subrequest_id: 0,
+                        index_uid: Some(index_uid),
+                        source_id: "test-source".to_string(),
+                        shard_id: Some(ShardId::from(1)),
+                        reason: PersistFailureReason::ShardRateLimited as i32,
+                    }],
+                };
+                Ok(response)
+            });
+        let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0);
+        ingester_pool.insert("test-ingester-0".into(), ingester_0.clone());
+
+        let ingest_request = IngestRequestV2 {
+            subrequests: vec![IngestSubrequest {
+                subrequest_id: 0,
+                index_id: "test-index-0".to_string(),
+                source_id: "test-source".to_string(),
+                doc_batch: Some(DocBatchV2::for_test(["test-doc-foo"])),
+            }],
+            commit_type: CommitTypeV2::Auto as i32,
+        };
+        let ingest_response = router.ingest(ingest_request).await.unwrap();
+        assert_eq!(ingest_response.successes.len(), 0);
+        assert_eq!(ingest_response.failures.len(), 1);
+        assert_eq!(
+            ingest_response.failures[0].reason(),
+            IngestFailureReason::ShardRateLimited
+        );
+    }
 }

quickwit/quickwit-ingest/src/ingest_v2/routing_table.rs

+51 -10
@@ -143,7 +143,9 @@ impl RoutingTableEntry {
         &self,
         ingester_pool: &IngesterPool,
         rate_limited_shards: &HashSet<ShardId>,
-    ) -> Option<&RoutingEntry> {
+    ) -> Result<&RoutingEntry, NextOpenShardError> {
+        let mut error = NextOpenShardError::NoShardsAvailable;
+
         for (shards, round_robin_idx) in [
             (&self.local_shards, &self.local_round_robin_idx),
             (&self.remote_shards, &self.remote_round_robin_idx),
@@ -154,17 +156,20 @@ impl RoutingTableEntry {
             for _attempt in 0..shards.len() {
                 let shard_idx = round_robin_idx.fetch_add(1, Ordering::Relaxed);
                 let shard_routing_entry: &RoutingEntry = &shards[shard_idx % shards.len()];
-                if !shard_routing_entry.shard_state.is_open()
-                    || rate_limited_shards.contains(&shard_routing_entry.shard_id)
-                {
+
+                if !shard_routing_entry.shard_state.is_open() {
+                    continue;
+                }
+                if rate_limited_shards.contains(&shard_routing_entry.shard_id) {
+                    error = NextOpenShardError::RateLimited;
                     continue;
                 }
                 if ingester_pool.contains_key(&shard_routing_entry.leader_id) {
-                    return Some(shard_routing_entry);
+                    return Ok(shard_routing_entry);
                 }
             }
         }
-        None
+        Err(error)
     }
 
     /// Inserts the open shards the routing table is not aware of.
@@ -323,6 +328,12 @@ impl RoutingTableEntry {
     }
 }
 
+#[derive(Debug, PartialEq, Eq)]
+pub(super) enum NextOpenShardError {
+    NoShardsAvailable,
+    RateLimited,
+}
+
 /// Stores the list of shards the router is aware of for each index and source. The resolution from
 /// index and source to shards is performed using index ID (not index UID) and source ID.
 #[derive(Debug)]
@@ -657,12 +668,12 @@ mod tests {
         let source_id: SourceId = "test-source".into();
         let table_entry = RoutingTableEntry::empty(index_uid.clone(), source_id.clone());
         let ingester_pool = IngesterPool::default();
-
         let mut rate_limited_shards = HashSet::new();
 
-        let shard_opt =
-            table_entry.next_open_shard_round_robin(&ingester_pool, &rate_limited_shards);
-        assert!(shard_opt.is_none());
+        let error = table_entry
+            .next_open_shard_round_robin(&ingester_pool, &rate_limited_shards)
+            .unwrap_err();
+        assert_eq!(error, NextOpenShardError::NoShardsAvailable);
 
         ingester_pool.insert("test-ingester-0".into(), IngesterServiceClient::mocked());
         ingester_pool.insert("test-ingester-1".into(), IngesterServiceClient::mocked());
@@ -778,6 +789,36 @@ mod tests {
         assert_eq!(shard.shard_id, ShardId::from(2));
     }
 
+    #[test]
+    fn test_routing_table_entry_next_open_shard_round_robin_rate_limited_error() {
+        let index_uid = IndexUid::for_test("test-index", 0);
+        let source_id: SourceId = "test-source".into();
+
+        let ingester_pool = IngesterPool::default();
+        ingester_pool.insert("test-ingester-0".into(), IngesterServiceClient::mocked());
+
+        let rate_limited_shards = HashSet::from_iter([ShardId::from(1)]);
+
+        let table_entry = RoutingTableEntry {
+            index_uid: index_uid.clone(),
+            source_id: source_id.clone(),
+            local_shards: vec![RoutingEntry {
+                index_uid: index_uid.clone(),
+                source_id: "test-source".to_string(),
+                shard_id: ShardId::from(1),
+                shard_state: ShardState::Open,
+                leader_id: "test-ingester-0".into(),
+            }],
+            local_round_robin_idx: AtomicUsize::default(),
+            remote_shards: Vec::new(),
+            remote_round_robin_idx: AtomicUsize::default(),
+        };
+        let error = table_entry
+            .next_open_shard_round_robin(&ingester_pool, &rate_limited_shards)
+            .unwrap_err();
+        assert_eq!(error, NextOpenShardError::RateLimited);
+    }
+
     #[test]
     fn test_routing_table_entry_insert_open_shards() {
         let index_uid_0 = IndexUid::for_test("test-index", 0);
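
For context, here is a minimal self-contained sketch of the error-priority rule that next_open_shard_round_robin implements above (simplified stand-in types, not the actual RoutingTableEntry): the error defaults to NoShardsAvailable and is upgraded to RateLimited as soon as an open shard is skipped solely because it is rate limited, so callers can tell the two failure modes apart.

use std::collections::HashSet;

// Simplified stand-ins for the Quickwit types; the real code uses richer
// types and a round-robin index over local and remote shards.
type ShardId = u64;

#[derive(Debug, PartialEq, Eq)]
enum NextOpenShardError {
    NoShardsAvailable,
    RateLimited,
}

struct Shard {
    shard_id: ShardId,
    is_open: bool,
}

// Mirrors the priority logic: default to NoShardsAvailable, upgrade to
// RateLimited when an open shard is skipped only because it is rate limited,
// and return the first usable shard otherwise.
fn next_open_shard<'a>(
    shards: &'a [Shard],
    rate_limited_shards: &HashSet<ShardId>,
) -> Result<&'a Shard, NextOpenShardError> {
    let mut error = NextOpenShardError::NoShardsAvailable;
    for shard in shards {
        if !shard.is_open {
            continue;
        }
        if rate_limited_shards.contains(&shard.shard_id) {
            error = NextOpenShardError::RateLimited;
            continue;
        }
        return Ok(shard);
    }
    Err(error)
}

fn main() {
    let shards = vec![Shard { shard_id: 1, is_open: true }];
    let rate_limited_shards: HashSet<ShardId> = HashSet::from_iter([1]);

    // The only open shard is rate limited, so the caller can report a
    // rate-limited failure (429) rather than "no shards available" (503).
    let error = next_open_shard(&shards, &rate_limited_shards).unwrap_err();
    assert_eq!(error, NextOpenShardError::RateLimited);
}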

quickwit/quickwit-ingest/src/ingest_v2/workbench.rs

+8 -1
@@ -37,7 +37,7 @@ use super::router::PersistRequestSummary;
 #[derive(Default)]
 pub(super) struct IngestWorkbench {
     pub subworkbenches: BTreeMap<SubrequestId, IngestSubworkbench>,
-    pub rate_limited_shard: HashSet<ShardId>,
+    pub rate_limited_shards: HashSet<ShardId>,
     pub num_successes: usize,
     /// The number of batch persist attempts. This is not sum of the number of attempts for each
     /// subrequest.
@@ -240,6 +240,13 @@ impl IngestWorkbench {
         self.record_failure(subrequest_id, SubworkbenchFailure::NoShardsAvailable);
     }
 
+    pub fn record_rate_limited(&mut self, subrequest_id: SubrequestId) {
+        self.record_failure(
+            subrequest_id,
+            SubworkbenchFailure::RateLimited(RateLimitingCause::ShardRateLimiting),
+        );
+    }
+
     /// Marks a node as unavailable for the span of the workbench.
     ///
     /// Remaining attempts will treat the node as if it was not in the ingester pool.

quickwit/quickwit-proto/src/ingest/mod.rs

+1 -1
@@ -36,7 +36,7 @@ pub type IngestV2Result<T> = std::result::Result<T, IngestV2Error>;
 pub enum RateLimitingCause {
     #[error("router load shedding")]
     RouterLoadShedding,
-    #[error("load shadding")]
+    #[error("load shedding")]
     LoadShedding,
     #[error("wal full (memory or disk)")]
     WalFull,
