Skip to content

Commit 37c8f96

Browse files
committed
adding more commentary and utility functions
1 parent 0a702af commit 37c8f96

File tree

1 file changed

+138
-9
lines changed

1 file changed

+138
-9
lines changed

s2/region_term_indexer.go

+138-9
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2020 Google Inc. All rights reserved.
1+
// Copyright 2021 Google Inc. All rights reserved.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -12,9 +12,69 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15+
//
16+
// Indexing Strategy
17+
// -----------------
18+
//
19+
// Given a query region, we want to find all of the document regions that
20+
// intersect it. The first step is to represent all the regions as S2Cell
21+
// coverings (see S2RegionCoverer). We then split the problem into two parts,
22+
// namely finding the document regions that are "smaller" than the query
23+
// region and those that are "larger" than the query region.
24+
//
25+
// We do this by defining two terms for each S2CellId: a "covering term" and
26+
// an "ancestor term". (In the implementation below, covering terms are
27+
// distinguished by prefixing a '$' to them.) For each document region, we
28+
// insert a covering term for every cell in the region's covering, and we
29+
// insert an ancestor term for these cells *and* all of their ancestors.
30+
//
31+
// Then given a query region, we can look up all the document regions that
32+
// intersect its covering by querying the union of the following terms:
33+
//
34+
// 1. An "ancestor term" for each cell in the query region. These terms
35+
// ensure that we find all document regions that are "smaller" than the
36+
// query region, i.e. where the query region contains a cell that is either
37+
// a cell of a document region or one of its ancestors.
38+
//
39+
// 2. A "covering term" for every ancestor of the cells in the query region.
40+
// These terms ensure that we find all the document regions that are
41+
// "larger" than the query region, i.e. where the document region contains a
42+
// cell that is a (proper) ancestor of a cell in the query region.
43+
//
44+
// Together, these terms find all of the document regions that intersect the
45+
// query region. Furthermore, the number of terms to be indexed and queried
46+
// are both fairly small, and can be bounded in terms of max_cells() and the
47+
// number of cell levels used.
48+
//
49+
// Optimizations
50+
// -------------
51+
//
52+
// + Cells at the maximum level being indexed (max_level()) have the special
53+
// property that they will never be an ancestor of a cell in the query
54+
// region. Therefore we can safely skip generating "covering terms" for
55+
// these cells (see query step 2 above).
56+
//
57+
// + If the index will contain only points (rather than general regions), then
58+
// we can skip all the covering terms mentioned above because there will
59+
// never be any document regions larger than the query region. This can
60+
// significantly reduce the size of queries.
61+
//
62+
// + If it is more important to optimize index size rather than query speed,
63+
// the number of index terms can be reduced by creating ancestor terms only
64+
// for the *proper* ancestors of the cells in a document region, and
65+
// compensating for this by including covering terms for all cells in the
66+
// query region (in addition to their ancestors).
67+
//
68+
// Effectively, when the query region and a document region contain exactly
69+
// the same cell, we have a choice about whether to treat this match as a
70+
// "covering term" or an "ancestor term". One choice minimizes query size
71+
// while the other minimizes index size.
72+
1573
package s2
1674

1775
import (
76+
"strings"
77+
1878
"github.com/golang/geo/s1"
1979
)
2080

@@ -86,6 +146,49 @@ func (o *Options) trueMaxLevel() int {
86146
return trueMax
87147
}
88148

149+
// RegionTermIndexer is a helper struct for adding spatial data to an
150+
// information retrieval system. Such systems work by converting documents
151+
// into a collection of "index terms" (e.g., representing words or phrases),
152+
// and then building an "inverted index" that maps each term to a list of
153+
// documents (and document positions) where that term occurs.
154+
//
155+
// This type deals with the problem of converting spatial data into index
156+
// terms, which can then be indexed along with the other document information.
157+
//
158+
// Spatial data is represented using the S2Region type. Useful S2Region
159+
// subtypes include:
160+
//
161+
// S2Cap
162+
// - a disc-shaped region
163+
//
164+
// S2LatLngRect
165+
// - a rectangle in latitude-longitude coordinates
166+
//
167+
// S2Polyline
168+
// - a polyline
169+
//
170+
// S2Polygon
171+
// - a polygon, possibly with multiple holes and/or shells
172+
//
173+
// S2CellUnion
174+
// - a region approximated as a collection of S2CellIds
175+
//
176+
// S2ShapeIndexRegion
177+
// - an arbitrary collection of points, polylines, and polygons
178+
//
179+
// S2ShapeIndexBufferedRegion
180+
// - like the above, but expanded by a given radius
181+
//
182+
// S2RegionUnion, S2RegionIntersection
183+
// - the union or intersection of arbitrary other regions
184+
//
185+
// So for example, if you want to query documents that are within 500 meters
186+
// of a polyline, you could use an S2ShapeIndexBufferedRegion containing the
187+
// polyline with a radius of 500 meters.
188+
//
189+
// For example usage, refer to:
190+
// https://github.com/google/s2geometry/blob/ad1489e898f369ca09e2099353ccd55bd0fd7a26/src/s2/s2region_term_indexer.h#L58
191+
89192
type RegionTermIndexer struct {
90193
options Options
91194
regionCoverer RegionCoverer
@@ -109,17 +212,21 @@ func NewRegionTermIndexerWithOptions(option Options) *RegionTermIndexer {
109212

110213
// GetTerm returns the index term string for the given cell id.
//
// For termTyp == ANCESTOR the term is prefix + id.ToToken(). For any other
// term type the package-level marker is inserted between the prefix and the
// token, which keeps a covering term distinct from the ancestor term of the
// same cell.
//
// NOTE(review): this hunk is shown as a diff. The lines numbered with a
// trailing "-" are the removed body (which ignored termTyp); the lines
// numbered with a trailing "+" are the current implementation.
func (rti *RegionTermIndexer) GetTerm(termTyp TermType, id CellID,
111214
prefix string) string {
112-
return prefix + id.ToToken()
113-
/*
114-
TODO - revisit this if needed.
115-
if termTyp == ANCESTOR {
116-
return prefix + id.ToToken()
117-
}
118-
return prefix + marker + id.ToToken()
119-
*/
215+
if termTyp == ANCESTOR {
216+
return prefix + id.ToToken()
217+
}
218+
return prefix + marker + id.ToToken()
120219
}
121220

122221
func (rti *RegionTermIndexer) GetIndexTermsForPoint(p Point, prefix string) []string {
222+
// See the top of this file for an overview of the indexing strategy.
223+
//
224+
// The last cell generated by this loop is effectively the covering for
225+
// the given point. You might expect that this cell would be indexed as a
226+
// covering term, but as an optimization we always index these cells as
227+
// ancestor terms only. This is possible because query regions will never
228+
// contain a descendant of such cells. Note that this is true even when
229+
// max_level() != true_max_level() (see S2RegionCoverer::Options).
123230
cellID := cellIDFromPoint(p)
124231
var rv []string
125232
for l := rti.options.minLevel; l <= rti.options.maxLevel; l += rti.options.levelMod {
@@ -141,6 +248,14 @@ func (rti *RegionTermIndexer) GetIndexTermsForRegion(region Region,
141248

142249
func (rti *RegionTermIndexer) GetIndexTermsForCanonicalCovering(
143250
covering CellUnion, prefix string) []string {
251+
// See the top of this file for an overview of the indexing strategy.
252+
//
253+
// Cells in the covering are normally indexed as covering terms. If we are
254+
// optimizing for query time rather than index space, they are also indexed
255+
// as ancestor terms (since this lets us reduce the number of terms in the
256+
// query). Finally, as an optimization we always index true_max_level()
257+
// cells as ancestor terms only, since these cells have the special property
258+
// that query regions will never contain a descendant of these cells.
144259
var rv []string
145260
prevID := CellID(0)
146261
tml := rti.options.trueMaxLevel()
@@ -237,3 +352,17 @@ func CapFromCenterAndRadius(centerLat, centerLon, dist float64) Cap {
237352
return CapFromCenterAngle(PointFromLatLng(
238353
LatLngFromDegrees(centerLat, centerLon)), s1.Angle((dist/1000)/6378))
239354
}
355+
356+
// FilterOutCoveringTerms returns a new slice holding every term that does
357+
// not start with the covering-term marker, reducing the number of search
358+
// terms when searching in a one-dimensional space (point-only indexing use case).
//
// The input slice is not modified; the result preserves the input order.
//
// NOTE(review): GetTerm builds covering terms as prefix + marker + token, so
// the HasPrefix check below only recognizes covering terms that were
// generated with an empty prefix. Confirm that callers never pass prefixed
// terms here.
359+
func FilterOutCoveringTerms(terms []string) []string {
360+
rv := make([]string, 0, len(terms))
361+
for _, term := range terms {
362+
if strings.HasPrefix(term, marker) {
363+
continue
364+
}
365+
rv = append(rv, term)
366+
}
367+
return rv
368+
}

0 commit comments

Comments
 (0)