Skip to content

Commit cb52ca6

Browse files
committed
Optimize time ranged leaf search queries
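Previously, when the search request contained a time range, we skipped the optimization that converts unneeded split searches into count queries. The optimization now also applies to time-ranged requests, using the splits that are fully contained in the requested time range.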
1 parent 72d5d1d commit cb52ca6

File tree

1 file changed

+75
-23
lines changed
  • quickwit/quickwit-search/src

1 file changed

+75
-23
lines changed

quickwit/quickwit-search/src/leaf.rs

+75-23
Original file line numberDiff line numberDiff line change
@@ -942,11 +942,6 @@ fn is_simple_all_query(search_request: &SearchRequest) -> bool {
942942
return false;
943943
}
944944

945-
// TODO: Update the logic to handle start_timestamp end_timestamp ranges
946-
if search_request.start_timestamp.is_some() || search_request.end_timestamp.is_some() {
947-
return false;
948-
}
949-
950945
let Ok(query_ast) = serde_json::from_str(&search_request.query_ast) else {
951946
return false;
952947
};
@@ -1000,6 +995,29 @@ impl CanSplitDoBetter {
1000995
}
1001996
}
1002997

998+
fn is_split_contained_in_search_time_range(
999+
split: &SplitIdAndFooterOffsets,
1000+
search_request: &SearchRequest,
1001+
) -> bool {
1002+
if let Some(start) = search_request.start_timestamp {
1003+
let Some(split_start) = split.timestamp_start else {
1004+
return false;
1005+
};
1006+
if split_start < start {
1007+
return false;
1008+
}
1009+
}
1010+
if let Some(end) = search_request.end_timestamp {
1011+
let Some(split_end) = split.timestamp_end else {
1012+
return false;
1013+
};
1014+
if split_end >= end {
1015+
return false;
1016+
}
1017+
}
1018+
true
1019+
}
1020+
10031021
fn to_splits_with_request(
10041022
splits: Vec<SplitIdAndFooterOffsets>,
10051023
request: Arc<SearchRequest>,
@@ -1011,23 +1029,33 @@ impl CanSplitDoBetter {
10111029
}
10121030

10131031
/// Calculate the number of splits which are guaranteed to deliver enough documents.
1032+
///
1033+
/// Only splits fully contained in the request's time range are counted. Returns `None` if
1034+
/// those splits together do not hold at least the requested number of documents.
10141035
fn get_min_required_splits(
10151036
splits: &[SplitIdAndFooterOffsets],
10161037
request: &SearchRequest,
1017-
) -> usize {
1038+
) -> Option<usize> {
10181039
let num_requested_docs = request.start_offset + request.max_hits;
10191040

1020-
splits
1021-
.into_iter()
1022-
.map(|split| split.num_docs)
1023-
// computing the partial sum
1024-
.scan(0u64, |partial_sum: &mut u64, num_docs_in_split: u64| {
1025-
*partial_sum += num_docs_in_split;
1026-
Some(*partial_sum)
1027-
})
1028-
.take_while(|partial_sum| *partial_sum < num_requested_docs)
1029-
.count()
1030-
+ 1
1041+
let mut min_required_splits = 0;
1042+
let mut partial_sum = 0u64;
1043+
1044+
for split in splits.iter() {
1045+
if !Self::is_split_contained_in_search_time_range(split, &request) {
1046+
continue;
1047+
}
1048+
1049+
partial_sum += split.num_docs;
1050+
1051+
if partial_sum >= num_requested_docs {
1052+
return Some(min_required_splits + 1);
1053+
}
1054+
1055+
min_required_splits += 1;
1056+
}
1057+
1058+
None
10311059
}
10321060

10331061
fn optimize_split_id_higher(
@@ -1042,7 +1070,11 @@ impl CanSplitDoBetter {
10421070
return Ok(Self::to_splits_with_request(splits, request));
10431071
}
10441072

1045-
let min_required_splits = Self::get_min_required_splits(&splits, &request);
1073+
let Some(min_required_splits) = Self::get_min_required_splits(&splits, &request) else {
1074+
// not enough splits contained in time range.
1075+
return Ok(Self::to_splits_with_request(splits, request));
1076+
};
1077+
10461078
let mut split_with_req = Self::to_splits_with_request(splits, request);
10471079

10481080
// In this case there is no sort order, we order by split id.
@@ -1060,14 +1092,21 @@ impl CanSplitDoBetter {
10601092
request: Arc<SearchRequest>,
10611093
mut splits: Vec<SplitIdAndFooterOffsets>,
10621094
) -> Result<Vec<(SplitIdAndFooterOffsets, SearchRequest)>, SearchError> {
1063-
splits.sort_unstable_by_key(|split| std::cmp::Reverse(split.timestamp_end()));
1095+
splits.sort_unstable_by_key(|split| {
1096+
let contained = Self::is_split_contained_in_search_time_range(split, &request);
1097+
(!contained, std::cmp::Reverse(split.timestamp_end()))
1098+
});
10641099

10651100
if !is_simple_all_query(&request) {
10661101
// no optimization opportunity here.
10671102
return Ok(Self::to_splits_with_request(splits, request));
10681103
}
10691104

1070-
let min_required_splits = Self::get_min_required_splits(&splits, &request);
1105+
let Some(min_required_splits) = Self::get_min_required_splits(&splits, &request) else {
1106+
// not enough splits contained in time range.
1107+
return Ok(Self::to_splits_with_request(splits, request));
1108+
};
1109+
10711110
let mut split_with_req = Self::to_splits_with_request(splits, request);
10721111

10731112
// We order by timestamp desc. split_with_req is sorted by timestamp_end desc.
@@ -1097,14 +1136,21 @@ impl CanSplitDoBetter {
10971136
request: Arc<SearchRequest>,
10981137
mut splits: Vec<SplitIdAndFooterOffsets>,
10991138
) -> Result<Vec<(SplitIdAndFooterOffsets, SearchRequest)>, SearchError> {
1100-
splits.sort_unstable_by_key(|split| split.timestamp_start());
1139+
splits.sort_unstable_by_key(|split| {
1140+
let contained = Self::is_split_contained_in_search_time_range(split, &request);
1141+
(!contained, split.timestamp_start())
1142+
});
11011143

11021144
if !is_simple_all_query(&request) {
11031145
// no optimization opportunity here.
11041146
return Ok(Self::to_splits_with_request(splits, request));
11051147
}
11061148

1107-
let min_required_splits = Self::get_min_required_splits(&splits, &request);
1149+
let Some(min_required_splits) = Self::get_min_required_splits(&splits, &request) else {
1150+
// not enough splits contained in time range.
1151+
return Ok(Self::to_splits_with_request(splits, request));
1152+
};
1153+
11081154
let mut split_with_req = Self::to_splits_with_request(splits, request);
11091155

11101156
// We order by timestamp asc. split_with_req is sorted by timestamp_start.
@@ -1141,7 +1187,10 @@ impl CanSplitDoBetter {
11411187
request: Arc<SearchRequest>,
11421188
mut splits: Vec<SplitIdAndFooterOffsets>,
11431189
) -> Result<Vec<(SplitIdAndFooterOffsets, SearchRequest)>, SearchError> {
1144-
splits.sort_unstable_by_key(|split| std::cmp::Reverse(split.timestamp_end()));
1190+
splits.sort_unstable_by_key(|split| {
1191+
let contained = Self::is_split_contained_in_search_time_range(split, &request);
1192+
(!contained, std::cmp::Reverse(split.timestamp_end()))
1193+
});
11451194

11461195
if !is_simple_all_query(&request) {
11471196
// no optimization opportunity here.
@@ -1154,6 +1203,9 @@ impl CanSplitDoBetter {
11541203
/// This function tries to detect upfront which splits contain the top n hits and convert other
11551204
/// split searches to count only searches. It also optimizes split order.
11561205
///
1206+
/// To skip splits in time ranged queries, we sort the splits first by whether they are
1207+
/// contained in the search request time range.
1208+
///
11571209
/// Returns the search_requests with their split.
11581210
fn optimize(
11591211
&self,

0 commit comments

Comments
 (0)