Skip to content

Commit e238ae9

Browse files
committed
internal/devtools/cmd/rmdoc: delete crawled pages from corpus
Gaby splits each crawled webpage into docs for embedding, computes the embeddings, and stores them in the vector db. rmdoc deletes all of those docs and their embeddings. This is meant to be run after the webpage is excluded from crawling with Crawler.Deny. For #63 Change-Id: I095a65b9a834ccf48062facc3654f40b43562e15 Reviewed-on: https://go-review.googlesource.com/c/oscar/+/635176 LUCI-TryBot-Result: Go LUCI <[email protected]> Reviewed-by: Jonathan Amsterdam <[email protected]>
1 parent 5e25bc0 commit e238ae9

File tree

3 files changed

+151
-1
lines changed

3 files changed

+151
-1
lines changed

internal/devtools/cmd/rmdoc/main.go

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
// Copyright 2024 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
// rmdoc deletes the documents from the corpus (including the vector db).
6+
//
7+
// Usage: go run . -project oscar-go-1 -firestoredb devel https://go.dev/x/y/z
8+
package main
9+
10+
import (
	"bufio"
	"context"
	"flag"
	"fmt"
	"log"
	"log/slog"
	"os"
	"strings"

	"cloud.google.com/go/compute/metadata"
	"golang.org/x/oscar/internal/dbspec"
	"golang.org/x/oscar/internal/docs"
	"golang.org/x/oscar/internal/gcp/firestore"
	"golang.org/x/oscar/internal/pebble"
	"golang.org/x/oscar/internal/storage"
)
25+
26+
// flags holds the values of the command-line flags.
// It starts out zero-valued; init binds project and firestoredb
// to their flags. No flag is currently bound to overlay.
var flags struct {
	project     string // -project: name of the Google Cloud Project
	firestoredb string // -firestoredb: name of the firestore db
	overlay     string
}
31+
32+
func init() {
33+
flag.StringVar(&flags.project, "project", "", "name of the Google Cloud Project")
34+
flag.StringVar(&flags.firestoredb, "firestoredb", "", "name of the firestore db")
35+
}
36+
37+
// logger is the logger handed to the corpus and storage.DB constructors
// used by this command. It is the slog default logger.
var logger = slog.Default()
38+
39+
func main() {
40+
flag.Parse()
41+
42+
args := flag.Args()
43+
if len(args) == 0 {
44+
log.Fatal("no args")
45+
}
46+
47+
gabyDB, gabyVectorDB := initGCP()
48+
corpus := docs.New(logger, gabyDB)
49+
50+
for _, url := range args {
51+
if !strings.HasPrefix(url, "https://go.dev/") {
52+
log.Println("ignoring unrecognized url:", url)
53+
continue
54+
}
55+
56+
// TODO: do we need to delete crawl.Page entries too?
57+
58+
for doc := range corpus.Docs(url) {
59+
hasVector := " "
60+
if _, ok := gabyVectorDB.Get(doc.ID); ok {
61+
hasVector = "*"
62+
}
63+
fmt.Printf("%v %v", hasVector, doc.ID)
64+
65+
fmt.Printf(" delete (y/N)? ")
66+
var a string
67+
fmt.Scanln(&a)
68+
if answer := strings.ToLower(strings.TrimSpace(a)); answer == "y" || answer == "yes" {
69+
gabyVectorDB.Delete(doc.ID)
70+
corpus.Delete(doc.ID)
71+
if _, ok := gabyVectorDB.Get(doc.ID); ok {
72+
log.Fatalf("error - %v not removed from vector db", doc.ID)
73+
}
74+
fmt.Print(" ↪ deleted")
75+
} else {
76+
fmt.Print(" ↪ skipped")
77+
}
78+
fmt.Println()
79+
}
80+
}
81+
}
82+
83+
func initGCP() (storage.DB, storage.VectorDB) {
84+
ctx := context.TODO()
85+
86+
if flags.project == "" {
87+
projectID, err := metadata.ProjectIDWithContext(ctx)
88+
if err != nil {
89+
log.Fatalf("metadata project ID: %v", err)
90+
}
91+
if projectID == "" {
92+
log.Fatal("project ID from metadata is empty")
93+
}
94+
flags.project = projectID
95+
}
96+
97+
db, err := openDB(&dbspec.Spec{
98+
Kind: "firestore",
99+
Location: flags.project,
100+
Name: flags.firestoredb,
101+
})
102+
if err != nil {
103+
log.Fatal(err)
104+
}
105+
106+
const vectorDBNamespace = "gaby"
107+
vdb, err := firestore.NewVectorDB(ctx, slog.Default(), flags.project, flags.firestoredb, vectorDBNamespace)
108+
if err != nil {
109+
log.Fatal(err)
110+
}
111+
return db, vdb
112+
}
113+
114+
// openDB opens the database described by spec.
115+
func openDB(spec *dbspec.Spec) (storage.DB, error) {
116+
switch spec.Kind {
117+
case "mem":
118+
return storage.MemDB(), nil
119+
case "pebble":
120+
return pebble.Open(logger, spec.Location)
121+
case "firestore":
122+
return firestore.NewDB(context.TODO(), logger, spec.Location, spec.Name)
123+
default:
124+
return nil, fmt.Errorf("unknown DB kind %q", spec.Kind)
125+
}
126+
}

internal/docs/docs.go

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ func (c *Corpus) Get(id string) (doc *Doc, ok bool) {
7777

7878
// Add adds a document with the given id, title, and text.
7979
// If the document already exists in the corpus with the same title and text,
80-
// Add is an no-op.
80+
// Add is a no-op.
8181
// Otherwise, if the document already exists in the corpus, it is replaced.
8282
func (c *Corpus) Add(id, title, text string) {
8383
old, ok := c.Get(id)
@@ -89,6 +89,18 @@ func (c *Corpus) Add(id, title, text string) {
8989
b.Apply()
9090
}
9191

92+
// Delete deletes a document with the given id.
93+
// If the document does not exist inthe corpus, Delete is a no-op.
94+
func (c *Corpus) Delete(id string) {
95+
doc, ok := c.Get(id)
96+
if !ok {
97+
return
98+
}
99+
b := c.db.Batch()
100+
timed.Delete(c.db, b, docsKind, ordered.Encode(doc.ID))
101+
b.Apply()
102+
}
103+
92104
// Docs returns an iterator over all documents in the corpus
93105
// with IDs starting with a given prefix.
94106
// The documents are ordered by ID.

internal/docs/docs_test.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,4 +117,16 @@ func TestCorpus(t *testing.T) {
117117
if !slices.Equal(ids, want) {
118118
t.Errorf("DocsAfter(0, id1) = %v, want %v", ids, want)
119119
}
120+
121+
// After Delete id1.
122+
corpus.Delete("id1")
123+
corpus.Delete("id1111") // doesn't exist
124+
ids = nil
125+
for d := range corpus.Docs("id1") {
126+
do(d)
127+
}
128+
want = []string{"id11"}
129+
if !slices.Equal(ids, want) {
130+
t.Errorf("DocsAfter(0, id1) = %v, want %v", ids, want)
131+
}
120132
}

0 commit comments

Comments
 (0)