1
- // Copyright 2020 Google Inc. All rights reserved.
1
+ // Copyright 2021 Google Inc. All rights reserved.
2
2
//
3
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
4
// you may not use this file except in compliance with the License.
12
12
// See the License for the specific language governing permissions and
13
13
// limitations under the License.
14
14
15
+ //
16
+ // Indexing Strategy
17
+ // -----------------
18
+ //
19
+ // Given a query region, we want to find all of the document regions that
20
+ // intersect it. The first step is to represent all the regions as S2Cell
21
+ // coverings (see S2RegionCoverer). We then split the problem into two parts,
22
+ // namely finding the document regions that are "smaller" than the query
23
+ // region and those that are "larger" than the query region.
24
+ //
25
+ // We do this by defining two terms for each S2CellId: a "covering term" and
26
+ // an "ancestor term". (In the implementation below, covering terms are
27
+ // distinguished by prefixing a '$' to them.) For each document region, we
28
+ // insert a covering term for every cell in the region's covering, and we
29
+ // insert an ancestor term for these cells *and* all of their ancestors.
30
+ //
31
+ // Then given a query region, we can look up all the document regions that
32
+ // intersect its covering by querying the union of the following terms:
33
+ //
34
+ // 1. An "ancestor term" for each cell in the query region. These terms
35
+ // ensure that we find all document regions that are "smaller" than the
36
+ // query region, i.e. where the query region contains a cell that is either
37
+ // a cell of a document region or one of its ancestors.
38
+ //
39
+ // 2. A "covering term" for every ancestor of the cells in the query region.
40
+ // These terms ensure that we find all the document regions that are
41
+ // "larger" than the query region, i.e. where the document region contains a
42
+ // cell that is a (proper) ancestor of a cell in the query region.
43
+ //
44
+ // Together, these terms find all of the document regions that intersect the
45
+ // query region. Furthermore, the number of terms to be indexed and queried
46
+ // are both fairly small, and can be bounded in terms of max_cells() and the
47
+ // number of cell levels used.
48
+ //
49
+ // Optimizations
50
+ // -------------
51
+ //
52
+ // + Cells at the maximum level being indexed (max_level()) have the special
53
+ // property that they will never be an ancestor of a cell in the query
54
+ // region. Therefore we can safely skip generating "covering terms" for
55
+ // these cells (see query step 2 above).
56
+ //
57
+ // + If the index will contain only points (rather than general regions), then
58
+ // we can skip all the covering terms mentioned above because there will
59
+ // never be any document regions larger than the query region. This can
60
+ // significantly reduce the size of queries.
61
+ //
62
+ // + If it is more important to optimize index size rather than query speed,
63
+ // the number of index terms can be reduced by creating ancestor terms only
64
+ // for the *proper* ancestors of the cells in a document region, and
65
+ // compensating for this by including covering terms for all cells in the
66
+ // query region (in addition to their ancestors).
67
+ //
68
+ // Effectively, when the query region and a document region contain exactly
69
+ // the same cell, we have a choice about whether to treat this match as a
70
+ // "covering term" or an "ancestor term". One choice minimizes query size
71
+ // while the other minimizes index size.
72
+
15
73
package s2
16
74
17
75
import (
76
+ "strings"
77
+
18
78
"github.com/golang/geo/s1"
19
79
)
20
80
@@ -86,6 +146,49 @@ func (o *Options) trueMaxLevel() int {
86
146
return trueMax
87
147
}
88
148
149
+ // RegionTermIndexer is a helper struct for adding spatial data to an
150
+ // information retrieval system. Such systems work by converting documents
151
+ // into a collection of "index terms" (e.g., representing words or phrases),
152
+ // and then building an "inverted index" that maps each term to a list of
153
+ // documents (and document positions) where that term occurs.
154
+ //
155
+ // This type deals with the problem of converting spatial data into index
156
+ // terms, which can then be indexed along with the other document information.
157
+ //
158
+ // Spatial data is represented using the S2Region type. Useful S2Region
159
+ // subtypes include:
160
+ //
161
+ // S2Cap
162
+ // - a disc-shaped region
163
+ //
164
+ // S2LatLngRect
165
+ // - a rectangle in latitude-longitude coordinates
166
+ //
167
+ // S2Polyline
168
+ // - a polyline
169
+ //
170
+ // S2Polygon
171
+ // - a polygon, possibly with multiple holes and/or shells
172
+ //
173
+ // S2CellUnion
174
+ // - a region approximated as a collection of S2CellIds
175
+ //
176
+ // S2ShapeIndexRegion
177
+ // - an arbitrary collection of points, polylines, and polygons
178
+ //
179
+ // S2ShapeIndexBufferedRegion
180
+ // - like the above, but expanded by a given radius
181
+ //
182
+ // S2RegionUnion, S2RegionIntersection
183
+ // - the union or intersection of arbitrary other regions
184
+ //
185
+ // So for example, if you want to query documents that are within 500 meters
186
+ // of a polyline, you could use an S2ShapeIndexBufferedRegion containing the
187
+ // polyline with a radius of 500 meters.
188
+ //
189
+ // For example usage, refer to:
190
+ // https://github.com/google/s2geometry/blob/ad1489e898f369ca09e2099353ccd55bd0fd7a26/src/s2/s2region_term_indexer.h#L58
191
+
89
192
type RegionTermIndexer struct {
90
193
options Options
91
194
regionCoverer RegionCoverer
@@ -109,17 +212,21 @@ func NewRegionTermIndexerWithOptions(option Options) *RegionTermIndexer {
109
212
110
213
func (rti * RegionTermIndexer ) GetTerm (termTyp TermType , id CellID ,
111
214
prefix string ) string {
112
- return prefix + id .ToToken ()
113
- /*
114
- TODO - revisit this if needed.
115
- if termTyp == ANCESTOR {
116
- return prefix + id.ToToken()
117
- }
118
- return prefix + marker + id.ToToken()
119
- */
215
+ if termTyp == ANCESTOR {
216
+ return prefix + id .ToToken ()
217
+ }
218
+ return prefix + marker + id .ToToken ()
120
219
}
121
220
122
221
func (rti * RegionTermIndexer ) GetIndexTermsForPoint (p Point , prefix string ) []string {
222
+ // See the top of this file for an overview of the indexing strategy.
223
+ //
224
+ // The last cell generated by this loop is effectively the covering for
225
+ // the given point. You might expect that this cell would be indexed as a
226
+ // covering term, but as an optimization we always index these cells as
227
+ // ancestor terms only. This is possible because query regions will never
228
+ // contain a descendant of such cells. Note that this is true even when
229
+ // max_level() != true_max_level() (see S2RegionCoverer::Options).
123
230
cellID := cellIDFromPoint (p )
124
231
var rv []string
125
232
for l := rti .options .minLevel ; l <= rti .options .maxLevel ; l += rti .options .levelMod {
@@ -141,6 +248,14 @@ func (rti *RegionTermIndexer) GetIndexTermsForRegion(region Region,
141
248
142
249
func (rti * RegionTermIndexer ) GetIndexTermsForCanonicalCovering (
143
250
covering CellUnion , prefix string ) []string {
251
+ // See the top of this file for an overview of the indexing strategy.
252
+ //
253
+ // Cells in the covering are normally indexed as covering terms. If we are
254
+ // optimizing for query time rather than index space, they are also indexed
255
+ // as ancestor terms (since this lets us reduce the number of terms in the
256
+ // query). Finally, as an optimization we always index true_max_level()
257
+ // cells as ancestor terms only, since these cells have the special property
258
+ // that query regions will never contain a descendant of these cells.
144
259
var rv []string
145
260
prevID := CellID (0 )
146
261
tml := rti .options .trueMaxLevel ()
@@ -237,3 +352,17 @@ func CapFromCenterAndRadius(centerLat, centerLon, dist float64) Cap {
237
352
return CapFromCenterAngle (PointFromLatLng (
238
353
LatLngFromDegrees (centerLat , centerLon )), s1 .Angle ((dist / 1000 )/ 6378 ))
239
354
}
355
+
356
+ // FilterOutCoveringTerms filters out the covering terms so that
357
+ // it helps to reduce the search terms while searching in a one
358
+ // dimensional space. (point only indexing usecase)
359
+ func FilterOutCoveringTerms (terms []string ) []string {
360
+ rv := make ([]string , 0 , len (terms ))
361
+ for _ , term := range terms {
362
+ if strings .HasPrefix (term , marker ) {
363
+ continue
364
+ }
365
+ rv = append (rv , term )
366
+ }
367
+ return rv
368
+ }
0 commit comments