Commit 26a3171

feat: web scraping with robots.txt support

1 parent 61c953c

5 files changed (+343, -5 lines)

Cargo.lock

Lines changed: 111 additions & 5 deletions
Some generated files are not rendered by default.

crates/q_cli/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -57,6 +57,7 @@ flume.workspace = true
 futures.workspace = true
 glob.workspace = true
 globset.workspace = true
+htmd = "0.1"
 indicatif.workspace = true
 indoc.workspace = true
 mimalloc.workspace = true
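
The only manifest change is the new `htmd` dependency, used to turn fetched HTML into markdown for scrape mode. As a rough illustration of how that conversion and the robots.txt handling named in the commit title could fit together, here is a minimal sketch: `scrape_to_markdown` is a hypothetical helper, `reqwest` is assumed to be available in the workspace, the robots.txt check is deliberately naive, and the committed implementation (in the new web_search module, not rendered on this page) may differ.

```rust
use std::error::Error;

// Sketch only, not the committed code: gate a scrape on a very naive
// robots.txt check, then convert the fetched HTML to markdown with htmd.
async fn scrape_to_markdown(target_url: &str) -> Result<String, Box<dyn Error>> {
    let url = reqwest::Url::parse(target_url)?;

    // Naive robots.txt gate: refuse any path listed under a `Disallow:` rule.
    // A real implementation would honor user-agent groups and wildcards.
    let robots_url = format!(
        "{}://{}/robots.txt",
        url.scheme(),
        url.host_str().unwrap_or_default()
    );
    if let Ok(resp) = reqwest::get(&robots_url).await {
        if let Ok(body) = resp.text().await {
            for line in body.lines() {
                if let Some(path) = line.trim().strip_prefix("Disallow:") {
                    let path = path.trim();
                    if !path.is_empty() && url.path().starts_with(path) {
                        return Err(format!("{target_url} is disallowed by robots.txt").into());
                    }
                }
            }
        }
    }

    // Fetch the page and convert the HTML document into a markdown string,
    // assuming `htmd::convert` as documented for the 0.1 release.
    let html = reqwest::get(url).await?.text().await?;
    let markdown = htmd::convert(&html)?;
    Ok(markdown)
}
```

A production check would also match specific user-agent groups and `Allow` rules, typically via a dedicated robots.txt parser rather than the bare prefix match shown here.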

crates/q_cli/src/cli/chat/tools/mod.rs

Lines changed: 10 additions & 0 deletions
@@ -3,6 +3,7 @@ pub mod fs_read;
 pub mod fs_write;
 pub mod gh_issue;
 pub mod use_aws;
+pub mod web_search;
 
 use std::collections::HashMap;
 use std::io::Write;
@@ -29,6 +30,7 @@ use fs_write::FsWrite;
 use gh_issue::GhIssue;
 use serde::Deserialize;
 use use_aws::UseAws;
+use web_search::WebSearch;
 
 use super::parser::ToolUse;
 
@@ -42,6 +44,7 @@ pub enum Tool {
     ExecuteBash(ExecuteBash),
     UseAws(UseAws),
     GhIssue(GhIssue),
+    WebSearch(WebSearch),
 }
 
 impl Tool {
@@ -53,6 +56,7 @@ impl Tool {
             Tool::ExecuteBash(_) => "execute_bash",
             Tool::UseAws(_) => "use_aws",
             Tool::GhIssue(_) => "gh_issue",
+            Tool::WebSearch(_) => "web_search",
         }
     }
 
@@ -64,6 +68,7 @@ impl Tool {
             Tool::ExecuteBash(execute_bash) => execute_bash.requires_acceptance(),
             Tool::UseAws(use_aws) => use_aws.requires_acceptance(),
             Tool::GhIssue(_) => false,
+            Tool::WebSearch(_) => false,
         }
     }
 
@@ -75,6 +80,7 @@ impl Tool {
             Tool::ExecuteBash(execute_bash) => execute_bash.invoke(updates).await,
             Tool::UseAws(use_aws) => use_aws.invoke(context, updates).await,
             Tool::GhIssue(gh_issue) => gh_issue.invoke(updates).await,
+            Tool::WebSearch(web_search) => web_search.invoke(updates).await,
         }
     }
 
@@ -86,6 +92,7 @@ impl Tool {
             Tool::ExecuteBash(execute_bash) => execute_bash.queue_description(updates),
             Tool::UseAws(use_aws) => use_aws.queue_description(updates),
             Tool::GhIssue(gh_issue) => gh_issue.queue_description(updates),
+            Tool::WebSearch(web_search) => web_search.queue_description(updates),
         }
     }
 
@@ -97,6 +104,7 @@ impl Tool {
             Tool::ExecuteBash(execute_bash) => execute_bash.validate(ctx).await,
             Tool::UseAws(use_aws) => use_aws.validate(ctx).await,
             Tool::GhIssue(gh_issue) => gh_issue.validate(ctx).await,
+            Tool::WebSearch(web_search) => web_search.validate(ctx).await,
         }
     }
 }
@@ -119,6 +127,7 @@ impl TryFrom<ToolUse> for Tool {
             "execute_bash" => Self::ExecuteBash(serde_json::from_value::<ExecuteBash>(value.args).map_err(map_err)?),
             "use_aws" => Self::UseAws(serde_json::from_value::<UseAws>(value.args).map_err(map_err)?),
             "report_issue" => Self::GhIssue(serde_json::from_value::<GhIssue>(value.args).map_err(map_err)?),
+            "web_search" => Self::WebSearch(serde_json::from_value::<WebSearch>(value.args).map_err(map_err)?),
             unknown => {
                 return Err(ToolResult {
                     tool_use_id: value.id,
@@ -201,6 +210,7 @@ impl ToolPermissions {
             "execute_bash" => "trust read-only commands".dark_grey(),
             "use_aws" => "trust read-only commands".dark_grey(),
             "report_issue" => "trusted".dark_green().bold(),
+            "web_search" => "trusted".dark_green().bold(),
             _ => "not trusted".dark_grey(),
         };
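
The registrations above assume a `WebSearch` type exported by the new `web_search` module (its source is not rendered on this page) that deserializes from the tool arguments and exposes the same `invoke`, `queue_description`, and `validate` entry points as the other tools; note that it is marked trusted and never requires acceptance. A hypothetical skeleton consistent with these match arms and with the JSON schema in `tool_index.json` below; the field names are inferred from that schema, and everything here is illustrative rather than the committed code:

```rust
use serde::Deserialize;

/// Hypothetical argument shape for the web_search tool, inferred from the
/// schema in tool_index.json; the committed struct may differ.
#[derive(Debug, Clone, Deserialize)]
pub struct WebSearch {
    /// Optional when mode is Scrape, since target_url is used instead.
    pub query: Option<String>,
    /// The schema in this commit only accepts "Scrape".
    pub mode: WebSearchMode,
    /// Page to scrape; only meaningful in scrape mode.
    pub target_url: Option<String>,
}

/// Hypothetical mode enum; serde deserializes the unit variant from "Scrape".
#[derive(Debug, Clone, Deserialize)]
pub enum WebSearchMode {
    Scrape,
}
```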

crates/q_cli/src/cli/chat/tools/tool_index.json

Lines changed: 27 additions & 0 deletions
@@ -147,6 +147,33 @@
       ]
     }
   },
+  "web_search": {
+    "name": "web_search",
+    "description": "Search/scrape the web for the specified query. Currently only supports scraping.",
+    "input_schema": {
+      "type": "object",
+      "properties": {
+        "query": {
+          "type": "string",
+          "description": "The search query to use. This is optional when mode is set to scrape since the target_url will be used instead."
+        },
+        "mode": {
+          "type": "string",
+          "enum": [
+            "Scrape"
+          ],
+          "description": "Scrape mode will return the markdown representation of the page. Search mode will return the first x results from a search engine."
+        },
+        "target_url": {
+          "type": "string",
+          "description": "The target URL to scrape. This is only used in scrape mode."
+        }
+      },
+      "required": [
+        "mode"
+      ]
+    }
+  },
   "gh_issue": {
     "name": "report_issue",
     "description": "Opens the browser to a pre-filled gh (GitHub) issue template to report chat issues, bugs, or feature requests. Pre-filled information includes the conversation transcript, chat context, and chat request IDs from the service.",

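Given this schema, a scrape request from the model carries a `mode` of `"Scrape"` plus a `target_url`, with `query` left optional. A small illustration of how such arguments would deserialize into the hypothetical `WebSearch` sketch above (the URL is a placeholder):

```rust
// Illustrative only: arguments matching the schema, deserialized with serde_json.
fn example_scrape_args() -> Result<WebSearch, serde_json::Error> {
    let args = serde_json::json!({
        "mode": "Scrape",
        "target_url": "https://example.com/some/page"
    });
    serde_json::from_value(args)
}
```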